1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.io;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.Reader;
27 import java.io.Writer;
28 import java.util.Iterator;
29
30 import org.apache.commons.lang.StringUtils;
31
32 import de.fu_berlin.ties.ContextMap;
33 import de.fu_berlin.ties.ProcessingException;
34 import de.fu_berlin.ties.TextProcessor;
35 import de.fu_berlin.ties.TiesConfiguration;
36 import de.fu_berlin.ties.text.TextUtils;
37
38 /***
39 * Externalizes the contents of a file in
40 * {@linkplain de.fu_berlin.ties.io.DelimSepValues DSV format} (or any other
41 * {@link de.fu_berlin.ties.io.FieldContainer}. For each entry, the contents
42 * of one specified field (read from the value of the {@link #CONFIG_KEY}
43 * configuration parameter) are stored in an external file. The base name of the
44 * external file (without file extension) is stored in the output DSV file
45 * instead of its content.
46 *
47 * <p>Base name and extension of the external files are determined from the
48 * input file. For example, if the input file is named <em>file.data</em> and
49 * contains 87 entries, 87 externalized files named <em>file01.data</em>,
50 * <em>file02.data</em>, ..., <em>file87.data</em> will be created
51 * (the number of leading zeros is determined as required to ensure that all
52 * file names have the same length). Entries are skipped (but still counted for
53 * numbering purposes) if the value of the specified field is empty or missing.
54 *
55 * <p>Instances of this class are thread-safe.
56 *
57 * @author Christian Siefkes
58 * @version $Revision: 1.7 $, $Date: 2006/10/21 16:04:22 $, $Author: siefkes $
59 */
60 public class Externalize extends TextProcessor {
61
62 /***
63 * Configuration key: The name of the field whose contents to externalize:
64 * {@value}.
65 */
66 public static final String CONFIG_KEY = "externalize.key";
67
68
69 /***
70 * Creates a new instance, using the
71 * {@linkplain TiesConfiguration#CONF standard configuration}.
72 *
73 * @param outExt the extension to use for output files
74 */
75 public Externalize(final String outExt) {
76 super(outExt);
77 }
78
79 /***
80 * Creates a new instance.
81 *
82 * @param outExt the extension to use for output files
83 * @param conf used to configure this instance; if <code>null</code>,
84 * the {@linkplain TiesConfiguration#CONF standard configuration} is used
85 */
86 public Externalize(final String outExt, final TiesConfiguration conf) {
87 super(outExt, conf);
88 }
89
90 /***
91 * Externalizes the contents of a field container. This method delegates
92 * to {@link #externalize(FieldContainer, File, String, String, String)},
93 * determining the name of the field to externalize from the
94 * {@link #CONFIG_KEY} configuration parameter.
95 *
96 * @param container the container to externalize; will be modified by
97 * replacing the values stored in the <code>key</code> field with the base
98 * names (without extension) of the newly created external files containing
99 * them
100 * @param directory the directory to use for storing the externalized files;
101 * if <code>null</code>, the working directory is used
102 * @param localName the name of the input file, used to determine the names
103 * of externalized files
104 * @param charset the character set to use for the external files;
105 * if <code>null</code>, the default charset of the current platform is used
106 * @throws IOException if an I/O error occurs while writing the external
107 * files
108 */
109 public void externalize(final FieldContainer container,
110 final File directory, final String localName, final String charset)
111 throws IOException {
112 externalize(container, directory, localName, charset,
113 getConfig().getString(CONFIG_KEY));
114 }
115
116 /***
117 * Externalizes the contents of a field container.
118 *
119 * @param container the container to externalize; will be modified by
120 * replacing the values stored in the <code>key</code> field with the base
121 * names (without extension) of the newly created external files containing
122 * them
123 * @param directory the directory to use for storing the externalized files;
124 * if <code>null</code>, the working directory is used
125 * @param localName the name of the input file, used to determine the names
126 * of externalized files
127 * @param charset the character set to use for the external files;
128 * if <code>null</code>, the default charset of the current platform is used
129 * @param key the name of the field to externalize
130 * @throws IOException if an I/O error occurs while writing the external
131 * files
132 */
133 public void externalize(final FieldContainer container,
134 final File directory, final String localName, final String charset,
135 final String key)
136 throws IOException {
137 final String baseName = IOUtils.getBaseName(localName);
138 final String extension = IOUtils.getExtension(localName);
139
140
141 final int numDigits = String.valueOf(container.size()).length();
142
143
144 int i = 1;
145 final Iterator<FieldMap> entryIter = container.entryIterator();
146 FieldMap entry;
147 String value;
148 String extFileBase;
149 String extFileName;
150 File extFile;
151 Writer extWriter;
152
153 while (entryIter.hasNext()) {
154 entry = entryIter.next();
155 value = (String) entry.get(key);
156
157
158 if (StringUtils.isNotEmpty(value)) {
159
160 extFileBase = baseName + StringUtils.leftPad(
161 Integer.toString(i), numDigits, '0');
162 extFileName = extFileBase + IOUtils.EXT_SEPARATOR + extension;
163 extFile = new File(directory, extFileName);
164 extWriter = IOUtils.openWriter(extFile, charset);
165
166
167 try {
168 IOUtils.writeToWriter(value, extWriter);
169 extWriter.write(TextUtils.LINE_SEPARATOR);
170 } finally {
171 IOUtils.tryToClose(extWriter);
172 }
173
174
175 entry.put(key, extFileBase);
176 }
177
178 i++;
179 }
180 }
181
182 /***
183 * {@inheritDoc} This implementation delegates to
184 * {@link #externalize(FieldContainer, File, String, String)}, using
185 * {@linkplain DelimSepValues DSV format} for input and output.
186 */
187 protected void doProcess(final Reader reader, final Writer writer,
188 final ContextMap context) throws IOException, ProcessingException {
189
190 final FieldContainer container = new DelimSepValues(getConfig());
191 container.read(reader);
192 externalize(container, (File) context.get(KEY_OUT_DIRECTORY),
193 (String) context.get(KEY_LOCAL_NAME),
194 (String) context.get(IOUtils.KEY_LOCAL_CHARSET));
195 container.store(writer);
196 }
197
198 }