View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.io;
23  
24  import java.io.IOException;
25  import java.io.Writer;
26  import java.util.ArrayList;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.regex.Matcher;
30  import java.util.regex.Pattern;
31  
32  import org.apache.commons.lang.StringUtils;
33  import org.dom4j.Element;
34  
35  import de.fu_berlin.ties.TiesConfiguration;
36  import de.fu_berlin.ties.text.TextUtils;
37  import de.fu_berlin.ties.util.Util;
38  
39  /***
40   * A field container that stores and processed its contents as values separated
41   * by a pipe character: '|' (a different delimiter string can be specified by
42   * changing the "dsv.field.separator" property). Entries are separated by
43   * newlines (or by whitespace if the "dsv.entry.separator.ws" is set to
44   * <code>true</code>).
45   *
46   * <p>The very first non-comment line contains the field names, separated by
47   * pipes. Each further non-comment lines contains the values of a entry
48   * ({@link de.fu_berlin.ties.io.FieldMap}). The first non-comment line must
49   * contain as least as most pipes as each further line, so the field names
50   * are known for all fields of all field maps. Empty fields can be used to
51   * separate {@linkplain de.fu_berlin.ties.io.FieldMap#getSection() sections}.
52   *
53   * <p>The '#' character can be used to introduce comment lines.
54   * Comments and lines that are empty or contain only whitespace are ignored
55   * when reading data.
56   *
57   * <p>Pipe characters and newlines within fields and '#' at the begin of the
58   * first field of an entry are escaped with a backslash.
59   * The recommended character set for storing delimiter-separated values is
60   * <strong>UTF-8</strong>.
61   *
62   * <p>Floats and doubles are formatted (rounded) via
63   * {@link de.fu_berlin.ties.util.Util#format(double)}.
64   *
65   * @author Christian Siefkes
66   * @version $Revision: 1.18 $, $Date: 2006/10/21 16:04:22 $, $Author: siefkes $
67   */
68  public class DelimSepValues extends FieldContainer {
69  
70      /***
71       * The recommended file extension for this format: {@value}
72       * (delimiter-separated values).
73       */
74      public static final String FILE_EXT = "dsv";
75  
76      /***
77       * The character introducing a comment: '#'.
78       */
79      public static final char COMMENT_START = '#';
80  
81      /***
82       * The escape character: '\' (a backslash).
83       */
84      public static final char ESCAPE = '//';
85  
86      /***
87       * The escape character as a regular expression.
88       */
89      private static final Pattern ESCAPE_PATTERN = Pattern.compile("////");
90  
91      /***
92       * String specifying the replacement to use for the
93       * {@link #globalEscapeMatcher}: prepend the escape character to the match.
94       */
95      private static final String GLOBAL_REPLACEMENT =
96          ESCAPE_PATTERN.pattern() + "$0";
97  
98  
99      /***
100      * Helper method that initializes the entry separator from a configuration.
101      *
102      * @param config the configuration to use
103      * @return the configured field separator
104      */
105     private static String initEntrySep(final TiesConfiguration config) {
106         final boolean anyWhitespace =
107             config.getBoolean("dsv.entry.separator.ws");
108 
109         if (anyWhitespace) {
110             // split at whitespace
111             return " ";
112         } else {
113             // split at newlines
114             return TextUtils.LINE_SEPARATOR;
115         }
116     }
117 
118     /***
119      * Helper method that initializes the entry separator pattern.
120      *
121      * @param entrySep the used entry separator
122      * @return the pattern created from quoting the field separator
123      */
124     private static Pattern initEntrySepPattern(final String entrySep) {
125         if (TextUtils.LINE_SEPARATOR.equals(entrySep)) {
126             // line separator: split at newlines
127             return TextUtils.NEWLINE_PATTERN;
128         } else if (" ".equals(entrySep)) {
129             // space: split at any whitespace
130             return TextUtils.WHITESPACE_PATTERN;
131         } else {
132             throw new IllegalArgumentException(
133                 "DSV: entry separator must be a space or newline instead of '"
134                     + entrySep + "'");
135         }
136     }
137 
138     /***
139      * Helper method that initializes the field separator from a configuration.
140      *
141      * @param config the configuration to use
142      * @return the configured field separator
143      */
144     private static String initFieldSep(final TiesConfiguration config) {
145         final String sep = config.getString("dsv.field.separator");
146         if (StringUtils.isNotEmpty(sep)) {
147             return sep;
148         } else {
149             // empty/missing value is converted into a single space
150             return " ";
151         }
152     }
153 
154     /***
155      * Helper method that initializes the field separator pattern.
156      *
157      * @param fieldSep the used field separator
158      * @return the pattern created from quoting the field separator
159      */
160     private static Pattern initFieldSepPattern(final String fieldSep) {
161         return Pattern.compile(Pattern.quote(fieldSep));
162     }
163 
164     /***
165      * Helper method that initializes the list of fixed keys from a
166      * configuration.
167      *
168      * @param config the configuration to use
169      * @return the list of fixed keys; or <code>null</code> if keys should be
170      * read from/written to the first entry
171      */
172     private static String[] initFixedKeys(final TiesConfiguration config) {
173         final String[] result = config.getStringArray("dsv.keys");
174         return TiesConfiguration.arrayIsEmpty(result) ?  null : result;
175     }
176 
177     /***
178      * Helper method that initializes the global escape matcher.
179      *
180      * @param fieldSepPattern the used field separator pattern
181      * @param entrySepPattern the used entry separator pattern
182      * @return the global escape matcher to use
183      */
184     private static Matcher initGlobalEscapeMatcher(
185             final Pattern fieldSepPattern, final Pattern entrySepPattern) {
186         return Pattern.compile("(?:"
187                 + fieldSepPattern.pattern() + '|' + entrySepPattern.pattern()
188                 + ')').matcher("");
189     }
190 
191 
192     /***
193      * The field delimiter character(s).
194      */
195     private final String fieldSeparator;
196 
197     /***
198      * Pattern matching the field delimiter character(s).
199      */
200     private final Pattern fieldSeparatorPattern;
201 
202     /***
203      * The entry delimiter character(s).
204      */
205     private final String entrySeparator;
206 
207     /***
208      * Pattern matching the entry delimiter character(s).
209      */
210     private final Pattern entrySeparatorPattern;
211 
212     /***
213      * Matches strings that must be escaped anywhere in fields
214      * (pipe characters and newlines).
215      */
216     private final Matcher globalEscapeMatcher;
217 
218     /***
219      * Matches strings that must be escaped when writing the begin
220      * of a non-comment line: the comment start character, optionally preceded
221      * by any number of backslashes. Escaped by prepending a backslash.
222      */
223     private final Matcher comstartEscapeMatcher = Pattern.compile(
224         ESCAPE_PATTERN.pattern() + "*" + COMMENT_START).matcher("");
225 
226     /***
227      * Matches strings that must be unescaped when reading the
228      * begin of a non-comment line: the comment start character preceded by one
229      * or more any number of backslashes. Unescaped by removing the first
230      * backslash.
231      */
232     private final Matcher comstartUnescapeMatcher = Pattern.compile(
233         ESCAPE_PATTERN.pattern() + "+" + COMMENT_START).matcher("");
234 
235     /***
236      * An optional fixed list of keys to use. If <code>null</code>, keys are
237      * read from/written to the first line of a DSV file.
238      */
239     private final String[] fixedKeys;
240 
241 
242     /***
243      * Creates a new empty instance, using the
244      * {@linkplain TiesConfiguration#CONF standard configuration}.
245      */
246     public DelimSepValues() {
247         this(TiesConfiguration.CONF);
248     }
249 
250     /***
251      * Creates a new empty instance.
252      *
253      * @param config used to configure this instance
254      */
255     public DelimSepValues(final TiesConfiguration config) {
256         this(initFieldSep(config), initEntrySep(config), initFixedKeys(config));
257     }
258 
259     /***
260      * Creates a new empty instance.
261      * 
262      * @param fieldSep the field separator to use
263      * @param entrySep the entry separator to use
264      * @param fixedHeaderNames an optional fixed list of keys to use; if
265      * <code>null</code>, keys are read from/written to the first line of a
266      * DSV file
267      */
268     public DelimSepValues(final String fieldSep, final String entrySep,
269             final String[] fixedHeaderNames) {
270         super();
271         fieldSeparator = fieldSep;
272         fieldSeparatorPattern = initFieldSepPattern(fieldSeparator);
273         entrySeparator = entrySep;
274         entrySeparatorPattern = initEntrySepPattern(entrySeparator);
275         globalEscapeMatcher = initGlobalEscapeMatcher(fieldSeparatorPattern,
276                 entrySeparatorPattern);
277         fixedKeys = fixedHeaderNames;
278     }
279 
280     /***
281      * Creates a new instance from an XML element, fulfilling the
282      * recommandation of the {@link XMLStorable} interface. Uses the
283      * {@link TiesConfiguration#CONF standard configuration} to configure this
284      * instance.
285      *
286      * @param element the XML element containing the serialized representation
287      */
288     public DelimSepValues(final Element element) {
289         this(element, TiesConfiguration.CONF);
290     }
291 
292     /***
293      * Creates a new instance from an XML element, fulfilling the
294      * recommandation of the {@link XMLStorable} interface.
295      *
296      * @param element the XML element containing the serialized representation
297      * @param config used to configure this instance
298      */
299     public DelimSepValues(final Element element,
300             final TiesConfiguration config) {
301         super(element);
302         fieldSeparator = initFieldSep(config);
303         fieldSeparatorPattern = initFieldSepPattern(fieldSeparator);
304         entrySeparator = initEntrySep(config);
305         entrySeparatorPattern = initEntrySepPattern(entrySeparator);
306         globalEscapeMatcher = initGlobalEscapeMatcher(fieldSeparatorPattern,
307                 entrySeparatorPattern);
308         fixedKeys = initFixedKeys(config);
309     }
310 
311 
312     /***
313      * Reads data as delimiter-separated values.
314      *
315      * @param input the input data to process
316      */
317     public void read(final CharSequence input) {
318         // use fixed keys or read keys from first line if null
319         read(input, fixedKeys);
320 
321 /*        // testing only
322         Util.LOG.info("Entries:");
323         final Iterator<FieldMap> entryIterator = entryIterator();
324         FieldMap entry;
325         while (entryIterator.hasNext()) {
326             entry = entryIterator.next();
327             Util.LOG.info(entry.toString());
328         } */
329     }
330 
331     /***
332      * Reads data as delimiter-separated values, using a specified array of
333      * field names.
334      *
335      * @param input the input data to process
336      * @param keys the array of field names; or <code>null</code> if the field
337      * names should be determined from the first line of the input
338      */
339     public void read(final CharSequence input, final String[] keys) {
340         // section numbering starts with 0 and is increased at each empty line
341         int sectionNo = 0;
342         boolean readFieldNames;
343 
344         if (keys != null) {
345             // field names are already known -- store them
346             readFieldNames = true;
347             for (int i = 0; i < keys.length; i++) {
348                 addKey(keys[i]);
349             }
350         } else {
351             // field names are read from first line of input
352             readFieldNames = false;
353         }
354 
355         // split into entries
356         String[] lines = entrySeparatorPattern.split(input);
357         String entry;
358         String[] orgFields;
359         String currentField;
360         int fIndex;
361         List<String> fieldList;
362 
363         for (int i = 0; i < lines.length; i++) {
364             entry = lines[i];
365 
366             if (StringUtils.isBlank(entry)) {
367                 // section numbering is increased at each empty line
368                 sectionNo++;
369             } else {
370                 // jump over comment lines
371                 if (entry.charAt(0) != COMMENT_START) {
372                     comstartUnescapeMatcher.reset(entry);
373 
374                     if (comstartUnescapeMatcher.lookingAt()) {
375                         // found escaped '#': remove the initial backslash
376                         entry = lines[i].substring(1);
377                     } else {
378                         entry = lines[i];
379                     }
380 
381                     // join with next line if ending in odd number of '\'
382                     while ((TextUtils.countLast(entry, ESCAPE) % 2) == 1) {
383                         i++;
384                         if (i >= lines.length) {
385                             throw new IllegalArgumentException(
386                                     "Premature end of entry: " + entry);
387                         }
388 
389                         // replace last backslash by newline + append next line
390                         entry = entry.substring(0, entry.length() - 1)
391                             + entrySeparator + lines[i];
392                         Util.LOG.debug("Joint entries to handle escape: "
393                                 + entry);
394                     }
395 
396                     // split line on pipes
397                     orgFields = fieldSeparatorPattern.split(entry);
398                     fieldList = new ArrayList<String>(orgFields.length);
399 
400                     // join field with next one if ending in odd number of '\'
401                     // (we know the last one doesn't because we checked above)
402                     for (fIndex = 0; fIndex < orgFields.length; fIndex++) {
403                         currentField = orgFields[fIndex];
404 
405                         while ((TextUtils.countLast(currentField, ESCAPE) % 2)
406                                 == 1) {
407                             fIndex++;
408 
409                             // replace last backslash by '|' + append next field
410                             currentField = currentField.substring(0,
411                                     currentField.length() - 1) + fieldSeparator 
412                                     + orgFields[fIndex];
413                             Util.LOG.debug("Joind fields to handle escape: "
414                                 + currentField);
415                         }
416 
417                         fieldList.add(currentField);
418                     }
419 
420                     if (readFieldNames) {
421                         // create new field map
422                         add(fieldList, sectionNo);
423                     } else {
424                         // first line: field headers
425                         for (fIndex = 0; fIndex < fieldList.size(); fIndex++) {
426                             addKey(fieldList.get(fIndex));
427                         }
428                         readFieldNames = true;
429                     }
430                 }
431             }
432         }
433     }
434 
435     /***
436      * Serializes contents as delimiter-separated values.
437      *
438      * @param writer the writer to write to; flushed but not closed by this
439      * method
440      * @throws IOException if an I/O error occurs while writing to the stream
441      */
442     public void store(final Writer writer) throws IOException {
443         Iterator<String> keyIter;
444 
445         // print headers unless they are fixed or there are no keys
446         if (fixedKeys == null) {
447             keyIter = keyIterator();
448             if (keyIter.hasNext()) {
449                 storeEntry(keyIter, writer);
450             }
451         }
452 
453         // print values of each field map
454         final Iterator mapsIter = entryIterator();
455         FieldMap currentMap;
456         List<Object> currentValues;
457         String currentKey;
458 
459         while (mapsIter.hasNext()) {
460             currentMap = (FieldMap) mapsIter.next();
461 
462             // create list of values (might be null) for each globally known key
463             // using an array list because we know the number of keys to expect
464             currentValues = new ArrayList<Object>(keyCount());
465             keyIter = keyIterator();
466 
467             while (keyIter.hasNext()) {
468                 currentKey = keyIter.next();
469                 // storing value for key (might be null)
470                 currentValues.add(currentMap.get(currentKey));
471             }
472 
473             // remove trailing null values (no need to print unnecessary tabs)
474             while ((currentValues.size() > 0)
475                     && (currentValues.get(currentValues.size() - 1) == null)) {
476                 currentValues.remove(currentValues.size() - 1);
477             }
478 
479             // print values of field map
480             storeEntry(currentValues.iterator(), writer);
481         }
482 
483         writer.flush();
484     }
485 
486     /***
487      * Helper method that serializes an entry as a line of delimiter-separated
488      * values. Tabs are used to separate fields; a newline is printed after
489      * the last fields. Escapes are added as required.
490      * <code>null</code> fields are handled by just adding a tab.
491      *
492      * @param iter an iterator over the fields to add
493      * @param writer the writer to write to
494      * @throws IOException if an I/O error occurs while writing
495      */
496     private void storeEntry(final Iterator iter, final Writer writer)
497             throws IOException {
498         Object rawItem;
499         Number numericItem;
500         String item;
501         boolean isFirst = true;
502 
503         while (iter.hasNext()) {
504             rawItem = iter.next();
505 
506             if (rawItem == null) {
507                 // convert null to empty string
508                 item = "";
509             } else if ((rawItem instanceof Double)
510                     || (rawItem instanceof Float)) {
511                 // use Util method to format/round floats + doubles
512                 numericItem = (Number) rawItem;
513                 item = Util.format(numericItem.doubleValue());
514             } else {
515                 // convert via toString method
516                 item = rawItem.toString();
517             }
518 
519             if (isFirst) {
520                 comstartEscapeMatcher.reset(item);
521 
522                 if (comstartEscapeMatcher.lookingAt()) {
523                     // prepend backslach to escape comment start character
524                     item = ESCAPE + item;
525                 }
526                 isFirst = false;
527             } else {
528                 // write pipe preceding item
529                 writer.write(fieldSeparator);
530             }
531 
532             // no need to bother with an empty string
533             if (item.length() > 0) {
534                 // escape any pipe chars + newlines in item prior to writing
535                 item = TextUtils.replaceAll(item, globalEscapeMatcher,
536                         GLOBAL_REPLACEMENT);
537                 writer.write(item);
538             }
539         }
540 
541         // print ending newline
542         writer.write(entrySeparator);
543     }
544 
545 }