/*
 * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
 * Development of this software is supported by the German Research Society,
 * Berlin-Brandenburg Graduate School in Distributed Information Systems
 * (DFG grant no. GRK 316).
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, visit
 * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 */
22  package de.fu_berlin.ties.io;
23  
24  import java.io.IOException;
25  import java.io.InputStream;
26  import java.io.InputStreamReader;
27  import java.io.Reader;
28  import java.io.Writer;
29  import java.util.ArrayList;
30  import java.util.Iterator;
31  import java.util.List;
32  import java.util.regex.Matcher;
33  import java.util.regex.Pattern;
34  
35  import org.apache.commons.lang.StringUtils;
36  
37  import de.fu_berlin.ties.text.TextUtils;
38  import de.fu_berlin.ties.util.Util;
39  
40  /***
41   * A field container that stores and processed its contents as values separated
42   * by a pipe character ('|'). Entries are separated by newlines.
43   *
44   * <p>The very first non-comment line contains the field names, separated by
45   * pipes. Each further non-comment lines contains the values of a entry
46   * ({@link de.fu_berlin.ties.io.FieldMap}). The first non-comment line must
47   * contain as least as most pipes as each further line, so the field names
48   * are known for all fields of all field maps. Empty fields are ignored when
49   * populating field maps.
50   *
51   * <p>The '#' character can be used to introduce comment lines.
52   * Comments and lines that are empty or contain only whitespace are ignored
53   * when reading data.
54   *
55   * <p>Pipe characters and newlines within fields and '#' at the begin of the
56   * first field of an entry are escaped with a backslash.
57   * The recommended character set for storing delimiter-separated values is
58   * <strong>UTF-8</strong>.
59   *
60   * <p>Floats and doubles are formatted (rounded) via
61   * {@link de.fu_berlin.ties.util.Util#format(double)}.
62   *
63   * @author Christian Siefkes
64   * @version $Revision: 1.4 $, $Date: 2004/09/06 17:23:31 $, $Author: siefkes $
65   */
66  public class DelimSepValues extends FieldContainer {
67  
68      /***
69       * The recommended file extension for this format: {@value}
70       * (delimiter-separated values).
71       */
72      public static final String FILE_EXT = "dsv";
73  
74      /***
75       * The character introducing a comment: '#'.
76       */
77      public static final char COMMENT_START = '#';
78  
79      /***
80       * The escape character: '\' (a backslash).
81       */
82      public static final char ESCAPE = '//';
83  
84      /***
85       * The escape character as a regular expression.
86       */
87      private static final Pattern ESCAPE_PATTERN = Pattern.compile("////");
88  
89      /***
90       * The field delimiter character: '|' (a pipe).
91       */
92      public static final char DELIM = '|';
93  
94      /***
95       * The field delimiter character as a regular expression.
96       */
97      private static final Pattern DELIM_PATTERN = Pattern.compile("//|");
98  
99      /***
100      * String specifying the replacement to use for the
101      * {@link #globalEscapeMatcher}: prepend the escape character to the match.
102      */
103     private static final String GLOBAL_REPLACEMENT =
104         ESCAPE_PATTERN.pattern() + "$0";
105 
106     /***
107      * Matches strings that must be escaped anywhere in fields
108      * (pipe characters and newlines).
109      */
110     private final Matcher globalEscapeMatcher = Pattern.compile("(?:"
111             + DELIM_PATTERN.pattern() + '|' + TextUtils.NEWLINE_ALTERNATIVES
112             + ')').matcher("");
113 
114     /***
115      * Matches strings that must be escaped when writing the begin
116      * of a non-comment line: the comment start character, optionally preceded
117      * by any number of backslashes. Escaped by prepending a backslash.
118      */
119     private final Matcher comstartEscapeMatcher = Pattern.compile(
120         ESCAPE_PATTERN.pattern() + "*" + COMMENT_START).matcher("");
121 
122     /***
123      * Matches strings that must be unescaped when reading the
124      * begin of a non-comment line: the comment start character preceded by one
125      * or more any number of backslashes. Unescaped by removing the first
126      * backslash.
127      */
128     private final Matcher comstartUnescapeMatcher = Pattern.compile(
129         ESCAPE_PATTERN.pattern() + "+" + COMMENT_START).matcher("");
130 
131     /***
132      * Creates a new empty instance.
133      */
134     public DelimSepValues() {
135         super();
136     }
137 
138     /***
139      * Creates a new instance from serialized delimiter-separated values.
140      *
141      * @param input the input data to process
142      * @throws IllegalArgumentException if the input data contains errors,
143      * e.g. when there are insufficient field names given in the first
144      * non-comment line
145      */
146     public DelimSepValues(final CharSequence input)
147             throws IllegalArgumentException {
148         this();
149         // split into into lines
150         String[] lines = TextUtils.splitLinesExact(input);
151         boolean readFieldNames = false;
152         String entry;
153         String[] orgFields;
154         String currentField;
155         int fIndex;
156         List<String> fieldList;
157 
158         for (int i = 0; i < lines.length; i++) {
159             entry = lines[i];
160 
161             // jump over comment lines and blank lines
162             if (StringUtils.isNotBlank(entry)
163                     && (entry.charAt(0) != COMMENT_START)) {
164                 comstartUnescapeMatcher.reset(entry);
165 
166                 if (comstartUnescapeMatcher.lookingAt()) {
167                     // found escaped '#': remove the initial backslash
168                     entry = lines[i].substring(1);
169                 } else {
170                     entry = lines[i];
171                 }
172 
173                 // join with next line if ending in odd number of '\'
174                 while ((TextUtils.countLast(entry, ESCAPE) % 2) == 1) {
175                     i++;
176                     if (i >= lines.length) {
177                         throw new IllegalArgumentException(
178                                 "Premature end of entry: " + entry);
179                     }
180 
181                     // replace last backslash by newline + append next line
182                     entry = entry.substring(0, entry.length() - 1)
183                         + TextUtils.LINE_SEPARATOR + lines[i];
184                     Util.LOG.debug("Joint lines to handle escape: " + entry);
185                 }
186 
187                 // split line on pipes
188                 orgFields = DELIM_PATTERN.split(entry);
189                 fieldList = new ArrayList<String>(orgFields.length);
190 
191                 // join field with next one if ending in odd number of '\'
192                 // (we know the last one doesn't because we checked above)
193                 for (fIndex = 0; fIndex < orgFields.length; fIndex++) {
194                     currentField = orgFields[fIndex];
195 
196                     while ((TextUtils.countLast(currentField, ESCAPE) % 2)
197                             == 1) {
198                         fIndex++;
199 
200                         // replace last backslash by '|' + append next field
201                         currentField = currentField.substring(0,
202                                 entry.length() - 1) + DELIM + orgFields[fIndex];
203                         Util.LOG.debug("Joint fields to handle escape: "
204                             + currentField);
205                     }
206 
207                     fieldList.add(currentField);
208                 }
209 
210                 if (readFieldNames) {
211                     // create new field map
212                     add(fieldList);
213                 } else {
214                     // first line: field headers
215                     for (fIndex = 0; fIndex < fieldList.size(); fIndex++) {
216                         addKey(fieldList.get(fIndex));
217                     }
218                     readFieldNames = true;
219                 }
220             }
221         }
222     }
223 
224     /***
225      * Creates a new instance from serialized delimiter-separated values.
226      *
227      * @param in a stream containing the input data to process, must use the
228      * UTF-8 charset; the stream is not closed by this method
229      * @throws IOException if an I/O error occurs while reading from the stream
230      * @throws IllegalArgumentException if the input data contains errors,
231      * esp. when there are insufficient field names given in the first
232      * non-comment line
233      */
234     public DelimSepValues(final InputStream in)
235             throws IOException, IllegalArgumentException {
236         // read data with UTF-8 charset
237         this(new InputStreamReader(in, IOUtils.STANDARD_UNICODE_CHARSET));
238     }
239 
240     /***
241      * Creates a new instance from serialized delimiter-separated values.
242      *
243      * @param reader a reader containing the input data to process; not closed
244      * by this method
245      * @throws IOException if an I/O error occurs while reading
246      * @throws IllegalArgumentException if the input data contains errors,
247      * esp. when there are insufficient field names given in the first
248      * non-comment line
249      */
250     public DelimSepValues(final Reader reader)
251             throws IOException, IllegalArgumentException {
252         this(IOUtils.readToString(reader));
253     }
254 
255     /***
256      * Creates a new instance and populates it from a {@link StorableContainer}.
257      *
258      * @param contents the contents to add by calling
259      * {@link StorableContainer#storeEntries(FieldContainer)}
260      */
261     public DelimSepValues(final StorableContainer contents) {
262         super(contents);
263     }
264 
265     /***
266      * Serializes contents as delimiter-separated values.
267      *
268      * @param writer the writer to write to; flushed but not closed by this
269      * method
270      * @throws IOException if an I/O error occurs while writing to the stream
271      */
272     public void store(final Writer writer) throws IOException {
273         // print headers
274         Iterator<String> keyIter = keyIterator();
275         storeEntry(keyIter, writer);
276 
277         // print values of each field map
278         final Iterator mapsIter = entryIterator();
279         FieldMap currentMap;
280         List<Object> currentValues;
281         String currentKey;
282 
283         while (mapsIter.hasNext()) {
284             currentMap = (FieldMap) mapsIter.next();
285 
286             // create list of values (might be null) for each globally known key
287             // using an array list because we know the number of keys to expect
288             currentValues = new ArrayList<Object>(keyCount());
289             keyIter = keyIterator();
290 
291             while (keyIter.hasNext()) {
292                 currentKey = keyIter.next();
293                 // storing value for key (might be null)
294                 currentValues.add(currentMap.get(currentKey));
295             }
296 
297             // remove trailing null values (no need to print unnecessary tabs)
298             while ((currentValues.size() > 0)
299                     && (currentValues.get(currentValues.size() - 1) == null)) {
300                 currentValues.remove(currentValues.size() - 1);
301             }
302 
303             // print values of field map
304             storeEntry(currentValues.iterator(), writer);
305         }
306 
307         writer.flush();
308     }
309 
310     /***
311      * Helper method that serializes an entry as a line of delimiter-separated
312      * values. Tabs are used to separate fields; a newline is printed after
313      * the last fields. Escapes are added as required.
314      * <code>null</code> fields are handled by just adding a tab.
315      *
316      * @param iter an iterator over the fields to add
317      * @param writer the writer to write to
318      * @throws IOException if an I/O error occurs while writing
319      */
320     private void storeEntry(final Iterator iter, final Writer writer)
321             throws IOException {
322         Object rawItem;
323         Number numericItem;
324         String item;
325         boolean isFirst = true;
326 
327         while (iter.hasNext()) {
328             rawItem = iter.next();
329 
330             if (rawItem == null) {
331                 // convert null to empty string
332                 item = "";
333             } else if ((rawItem instanceof Double)
334                     || (rawItem instanceof Float)) {
335                 // use Util method to format/round floats + doubles
336                 numericItem = (Number) rawItem;
337                 item = Util.format(numericItem.doubleValue());
338             } else {
339                 // convert via toString method
340                 item = rawItem.toString();
341             }
342 
343             if (isFirst) {
344                 comstartEscapeMatcher.reset(item);
345 
346                 if (comstartEscapeMatcher.lookingAt()) {
347                     // prepend backslach to escape comment start character
348                     item = ESCAPE + item;
349                 }
350                 isFirst = false;
351             } else {
352                 // write pipe preceding item
353                 writer.write(DELIM);
354             }
355 
356             // no need to bother with an empty string
357             if (item.length() > 0) {
358                 // escape any pipe chars + newlines in item prior to writing
359                 item = TextUtils.replaceAll(item, globalEscapeMatcher,
360                         GLOBAL_REPLACEMENT);
361                 writer.write(item);
362             }
363         }
364 
365         // print ending newline
366         writer.write(TextUtils.LINE_SEPARATOR);
367     }
368 
369 }