1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.io;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.Reader;
27 import java.io.Writer;
28 import java.util.regex.Pattern;
29
30 import org.apache.commons.lang.StringUtils;
31
32 import de.fu_berlin.ties.ContextMap;
33 import de.fu_berlin.ties.ProcessingException;
34 import de.fu_berlin.ties.TextProcessor;
35 import de.fu_berlin.ties.TiesConfiguration;
36
37 /***
38 * Uses a given pattern to split an input file into a series of output files.
39 * Base name and extension of the output files are determined from the
40 * input file. For example, if the input file is named <em>file.data</em> and
41 * contains 87 sections (separated by the configured pattern), 87 output files
42 * named <em>file01.data</em>, <em>file02.data</em>, ..., <em>file87.data</em>
43 * will be created (the number of leading zeros is determined as required to
44 * ensure that all file names have the same length).
45 *
46 * <p>If a splitted sequence is empty or contains only whitespace characters,
47 * it is not stored and not counted (but since we don't know in advance how
48 * many sequences are empty/blank, they are still considered when determining
49 * the number of leading zeros that are prepended to filenames).
50 *
51 * <p>Instances of this class are thread-safe. It's also possible to use the
52 * static {@link #split(Reader, File, String, String, Pattern)} method without
53 * creating an instance.
54 *
55 * @author Christian Siefkes
56 * @version $Revision: 1.6 $, $Date: 2006/10/21 16:04:22 $, $Author: siefkes $
57 */
58 public class Split extends TextProcessor {
59
60 /***
61 * Configuration key: The default pattern used to split input: {@value}.
62 */
63 public static final String CONFIG_PATTERN = "split.pattern";
64
65
66 /***
67 * Splits an input file into a series of output files, calling
68 * {@link Pattern#split(java.lang.CharSequence)} and storing each member
69 * of the returned array in a separate file.
70 *
71 * @param reader reader containing the text to process; not closed by this
72 * method
73 * @param directory the directory to use for storing the output files;
74 * if <code>null</code>, the working directory is used
75 * @param localName the name of the input file, used to determine the names
76 * of output files
77 * @param charset the character set to use for the external files;
78 * if <code>null</code>, the default charset of the current platform is used
79 * @param pattern the pattern used to split the input
80 * @throws IllegalArgumentException if <code>pattern</code> is
81 * <code>null</code>
82 * @throws IOException if an I/O error occurs while writing the output
83 * files
84 */
85 public static void split(final Reader reader, final File directory,
86 final String localName, final String charset, final Pattern pattern)
87 throws IllegalArgumentException, IOException {
88
89 final String[] splitted;
90 if (pattern != null) {
91 final String input = IOUtils.readToString(reader);
92 splitted = pattern.split(input);
93 } else {
94 throw new IllegalArgumentException("Split pattern is null");
95 }
96
97
98 final String baseName = IOUtils.getBaseName(localName);
99 final String extension = IOUtils.getExtension(localName);
100
101
102 final int numDigits = String.valueOf(splitted.length).length();
103
104
105 String extFileBase;
106 String extFileName;
107 File extFile;
108 Writer extWriter;
109 int counter = 1;
110
111 for (int i = 0; i < splitted.length; i++) {
112
113 if (StringUtils.isNotBlank(splitted[i])) {
114
115 extFileBase = baseName + StringUtils.leftPad(
116 Integer.toString(i), numDigits, '0');
117 extFileName = extFileBase + IOUtils.EXT_SEPARATOR + extension;
118 extFile = new File(directory, extFileName);
119 extWriter = IOUtils.openWriter(extFile, charset);
120
121
122 try {
123 IOUtils.writeToWriter(splitted[i], extWriter);
124 } finally {
125 IOUtils.tryToClose(extWriter);
126 }
127
128
129 counter++;
130 }
131 }
132 }
133
134
135 /***
136 * The default pattern used to split input; might be <code>null</code>.
137 */
138 private final Pattern defaultPattern;
139
140
141 /***
142 * Creates a new instance, using the
143 * {@linkplain TiesConfiguration#CONF standard configuration}.
144 */
145 public Split() {
146 this(TiesConfiguration.CONF);
147 }
148
149 /***
150 * Creates a new instance.
151 *
152 * @param conf used to configure this instance
153 */
154 public Split(final TiesConfiguration conf) {
155
156 super("tmp", conf);
157 final String patternString = conf.getString(CONFIG_PATTERN, null);
158 defaultPattern = (patternString != null)
159 ? Pattern.compile(patternString) : null;
160 }
161
162
163 /***
164 * {@inheritDoc} This implementation delegates to
165 * {@link #split(Reader, File, String, String)}.
166 */
167 protected void doProcess(final Reader reader, final Writer writer,
168 final ContextMap context) throws IOException, ProcessingException {
169 split(reader, (File) context.get(KEY_OUT_DIRECTORY),
170 (String) context.get(KEY_LOCAL_NAME),
171 (String) context.get(IOUtils.KEY_LOCAL_CHARSET));
172 }
173
174 /***
175 * Delegates to the static
176 * {@link #split(Reader, File, String, String, Pattern)} method, using the
177 * configured default pattern.
178 *
179 * @param reader reader containing the text to process; not closed by this
180 * method
181 * @param directory the directory to use for storing the output files;
182 * if <code>null</code>, the working directory is used
183 * @param localName the name of the input file, used to determine the names
184 * of output files
185 * @param charset the character set to use for the external files;
186 * if <code>null</code>, the default charset of the current platform is used
187 * @throws IllegalArgumentException if the configured default pattern is
188 * <code>null</code>
189 * @throws IOException if an I/O error occurs while writing the output
190 * files
191 */
192 public void split(final Reader reader, final File directory,
193 final String localName, final String charset)
194 throws IllegalArgumentException, IOException {
195 split(reader, directory, localName, charset, defaultPattern);
196 }
197
198 }