View Javadoc

1   /*
2    * Copyright (C) 2005-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.io;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.Reader;
27  import java.io.Writer;
28  import java.util.regex.Pattern;
29  
30  import org.apache.commons.lang.StringUtils;
31  
32  import de.fu_berlin.ties.ContextMap;
33  import de.fu_berlin.ties.ProcessingException;
34  import de.fu_berlin.ties.TextProcessor;
35  import de.fu_berlin.ties.TiesConfiguration;
36  
37  /***
38   * Uses a given pattern to split an input file into a series of output files.
39   * Base name and extension of the output files are determined from the
40   * input file. For example, if the input file is named <em>file.data</em> and
41   * contains 87 sections (separated by the configured pattern), 87 output files
42   * named <em>file01.data</em>, <em>file02.data</em>, ..., <em>file87.data</em>
43   * will be created (the number of leading zeros is determined as required to
44   * ensure that all file names have the same length).
45   *
46   * <p>If a splitted sequence is empty or contains only whitespace characters,
47   * it is not stored and not counted (but since we don't know in advance how
48   * many sequences are empty/blank, they are still considered when determining
49   * the number of leading zeros that are prepended to filenames).
50   *
51   * <p>Instances of this class are thread-safe. It's also possible to use the
52   * static {@link #split(Reader, File, String, String, Pattern)} method without
53   * creating an instance.
54   *
55   * @author Christian Siefkes
56   * @version $Revision: 1.6 $, $Date: 2006/10/21 16:04:22 $, $Author: siefkes $
57   */
58  public class Split extends TextProcessor {
59  
60      /***
61       * Configuration key: The default pattern used to split input: {@value}.
62       */
63      public static final String CONFIG_PATTERN = "split.pattern";
64  
65  
66      /***
67       * Splits an input file into a series of output files, calling
68       * {@link Pattern#split(java.lang.CharSequence)} and storing each member
69       * of the returned array in a separate file.
70       *
71       * @param reader reader containing the text to process; not closed by this
72       * method
73       * @param directory the directory to use for storing the output files;
74       * if <code>null</code>, the working directory is used
75       * @param localName the name of the input file, used to determine the names
76       * of output files
77       * @param charset the character set to use for the external files;
78       * if <code>null</code>, the default charset of the current platform is used
79       * @param pattern the pattern used to split the input
80       * @throws IllegalArgumentException if <code>pattern</code> is
81       * <code>null</code>
82       * @throws IOException if an I/O error occurs while writing the output
83       * files
84       */
85      public static void split(final Reader reader, final File directory,
86              final String localName, final String charset, final Pattern pattern)
87      throws IllegalArgumentException, IOException {
88          // check that pattern is not null + use to split input
89          final String[] splitted;
90          if (pattern != null) {
91              final String input = IOUtils.readToString(reader);
92              splitted = pattern.split(input);
93          } else {
94              throw new IllegalArgumentException("Split pattern is null");
95          }
96  
97          // determine name components
98          final String baseName = IOUtils.getBaseName(localName);
99          final String extension = IOUtils.getExtension(localName);
100 
101         // number of digits to ensure that all numbers have same length 
102         final int numDigits = String.valueOf(splitted.length).length();
103 
104         // iterate array elements, counting from 1
105         String extFileBase;
106         String extFileName;
107         File extFile;
108         Writer extWriter;
109         int counter = 1;
110 
111         for (int i = 0; i < splitted.length; i++) {
112             // skip empty/blank entries
113             if (StringUtils.isNotBlank(splitted[i])) {
114                 // left-pad with zeros so all numbers have same length
115                 extFileBase = baseName + StringUtils.leftPad(
116                         Integer.toString(i), numDigits, '0');
117                 extFileName = extFileBase + IOUtils.EXT_SEPARATOR + extension;
118                 extFile = new File(directory, extFileName);
119                 extWriter = IOUtils.openWriter(extFile, charset);
120 
121                 // write text to external file
122                 try {
123                     IOUtils.writeToWriter(splitted[i], extWriter);
124                 } finally {
125                     IOUtils.tryToClose(extWriter);
126                 }
127 
128                 // counter is not modified for empty/blank entires
129                 counter++;
130             }
131         }
132     }
133 
134 
135     /***
136      * The default pattern used to split input; might be <code>null</code>.
137      */
138     private final Pattern defaultPattern;
139 
140 
141     /***
142      * Creates a new instance, using the
143      * {@linkplain TiesConfiguration#CONF standard configuration}.
144      */
145     public Split() {
146         this(TiesConfiguration.CONF);
147     }
148 
149     /***
150      * Creates a new instance.
151      *
152      * @param conf used to configure this instance
153      */
154     public Split(final TiesConfiguration conf) {
155         // output extension is just a dummy since we re-use the input extension
156         super("tmp", conf);
157         final String patternString = conf.getString(CONFIG_PATTERN, null);
158         defaultPattern = (patternString != null)
159                 ? Pattern.compile(patternString) : null;
160     }
161 
162 
163     /***
164      * {@inheritDoc} This implementation delegates to
165      * {@link #split(Reader, File, String, String)}.
166      */
167     protected void doProcess(final Reader reader, final Writer writer,
168             final ContextMap context) throws IOException, ProcessingException {
169         split(reader, (File) context.get(KEY_OUT_DIRECTORY),
170                 (String) context.get(KEY_LOCAL_NAME),
171                 (String) context.get(IOUtils.KEY_LOCAL_CHARSET));
172     }
173 
174     /***
175      * Delegates to the static
176      * {@link #split(Reader, File, String, String, Pattern)} method, using the
177      * configured default pattern.
178      *
179      * @param reader reader containing the text to process; not closed by this
180      * method
181      * @param directory the directory to use for storing the output files;
182      * if <code>null</code>, the working directory is used
183      * @param localName the name of the input file, used to determine the names
184      * of output files
185      * @param charset the character set to use for the external files;
186      * if <code>null</code>, the default charset of the current platform is used
187      * @throws IllegalArgumentException if the configured default pattern is
188      * <code>null</code>
189      * @throws IOException if an I/O error occurs while writing the output
190      * files
191      */
192     public void split(final Reader reader, final File directory,
193             final String localName, final String charset)
194     throws IllegalArgumentException, IOException {
195         split(reader, directory, localName, charset, defaultPattern);
196     }
197 
198 }