1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.io;
23
24 import java.io.IOException;
25 import java.io.Writer;
26 import java.util.ArrayList;
27 import java.util.Iterator;
28 import java.util.List;
29 import java.util.regex.Matcher;
30 import java.util.regex.Pattern;
31
32 import org.apache.commons.lang.StringUtils;
33 import org.dom4j.Element;
34
35 import de.fu_berlin.ties.TiesConfiguration;
36 import de.fu_berlin.ties.text.TextUtils;
37 import de.fu_berlin.ties.util.Util;
38
39 /***
40 * A field container that stores and processed its contents as values separated
41 * by a pipe character: '|' (a different delimiter string can be specified by
42 * changing the "dsv.field.separator" property). Entries are separated by
43 * newlines (or by whitespace if the "dsv.entry.separator.ws" is set to
44 * <code>true</code>).
45 *
46 * <p>The very first non-comment line contains the field names, separated by
47 * pipes. Each further non-comment lines contains the values of a entry
48 * ({@link de.fu_berlin.ties.io.FieldMap}). The first non-comment line must
49 * contain as least as most pipes as each further line, so the field names
50 * are known for all fields of all field maps. Empty fields can be used to
51 * separate {@linkplain de.fu_berlin.ties.io.FieldMap#getSection() sections}.
52 *
53 * <p>The '#' character can be used to introduce comment lines.
54 * Comments and lines that are empty or contain only whitespace are ignored
55 * when reading data.
56 *
57 * <p>Pipe characters and newlines within fields and '#' at the begin of the
58 * first field of an entry are escaped with a backslash.
59 * The recommended character set for storing delimiter-separated values is
60 * <strong>UTF-8</strong>.
61 *
62 * <p>Floats and doubles are formatted (rounded) via
63 * {@link de.fu_berlin.ties.util.Util#format(double)}.
64 *
65 * @author Christian Siefkes
66 * @version $Revision: 1.18 $, $Date: 2006/10/21 16:04:22 $, $Author: siefkes $
67 */
68 public class DelimSepValues extends FieldContainer {
69
70 /***
71 * The recommended file extension for this format: {@value}
72 * (delimiter-separated values).
73 */
74 public static final String FILE_EXT = "dsv";
75
76 /***
77 * The character introducing a comment: '#'.
78 */
79 public static final char COMMENT_START = '#';
80
81 /***
82 * The escape character: '\' (a backslash).
83 */
84 public static final char ESCAPE = '//';
85
86 /***
87 * The escape character as a regular expression.
88 */
89 private static final Pattern ESCAPE_PATTERN = Pattern.compile("////");
90
91 /***
92 * String specifying the replacement to use for the
93 * {@link #globalEscapeMatcher}: prepend the escape character to the match.
94 */
95 private static final String GLOBAL_REPLACEMENT =
96 ESCAPE_PATTERN.pattern() + "$0";
97
98
99 /***
100 * Helper method that initializes the entry separator from a configuration.
101 *
102 * @param config the configuration to use
103 * @return the configured field separator
104 */
105 private static String initEntrySep(final TiesConfiguration config) {
106 final boolean anyWhitespace =
107 config.getBoolean("dsv.entry.separator.ws");
108
109 if (anyWhitespace) {
110
111 return " ";
112 } else {
113
114 return TextUtils.LINE_SEPARATOR;
115 }
116 }
117
118 /***
119 * Helper method that initializes the entry separator pattern.
120 *
121 * @param entrySep the used entry separator
122 * @return the pattern created from quoting the field separator
123 */
124 private static Pattern initEntrySepPattern(final String entrySep) {
125 if (TextUtils.LINE_SEPARATOR.equals(entrySep)) {
126
127 return TextUtils.NEWLINE_PATTERN;
128 } else if (" ".equals(entrySep)) {
129
130 return TextUtils.WHITESPACE_PATTERN;
131 } else {
132 throw new IllegalArgumentException(
133 "DSV: entry separator must be a space or newline instead of '"
134 + entrySep + "'");
135 }
136 }
137
138 /***
139 * Helper method that initializes the field separator from a configuration.
140 *
141 * @param config the configuration to use
142 * @return the configured field separator
143 */
144 private static String initFieldSep(final TiesConfiguration config) {
145 final String sep = config.getString("dsv.field.separator");
146 if (StringUtils.isNotEmpty(sep)) {
147 return sep;
148 } else {
149
150 return " ";
151 }
152 }
153
154 /***
155 * Helper method that initializes the field separator pattern.
156 *
157 * @param fieldSep the used field separator
158 * @return the pattern created from quoting the field separator
159 */
160 private static Pattern initFieldSepPattern(final String fieldSep) {
161 return Pattern.compile(Pattern.quote(fieldSep));
162 }
163
164 /***
165 * Helper method that initializes the list of fixed keys from a
166 * configuration.
167 *
168 * @param config the configuration to use
169 * @return the list of fixed keys; or <code>null</code> if keys should be
170 * read from/written to the first entry
171 */
172 private static String[] initFixedKeys(final TiesConfiguration config) {
173 final String[] result = config.getStringArray("dsv.keys");
174 return TiesConfiguration.arrayIsEmpty(result) ? null : result;
175 }
176
177 /***
178 * Helper method that initializes the global escape matcher.
179 *
180 * @param fieldSepPattern the used field separator pattern
181 * @param entrySepPattern the used entry separator pattern
182 * @return the global escape matcher to use
183 */
184 private static Matcher initGlobalEscapeMatcher(
185 final Pattern fieldSepPattern, final Pattern entrySepPattern) {
186 return Pattern.compile("(?:"
187 + fieldSepPattern.pattern() + '|' + entrySepPattern.pattern()
188 + ')').matcher("");
189 }
190
191
192 /***
193 * The field delimiter character(s).
194 */
195 private final String fieldSeparator;
196
197 /***
198 * Pattern matching the field delimiter character(s).
199 */
200 private final Pattern fieldSeparatorPattern;
201
202 /***
203 * The entry delimiter character(s).
204 */
205 private final String entrySeparator;
206
207 /***
208 * Pattern matching the entry delimiter character(s).
209 */
210 private final Pattern entrySeparatorPattern;
211
212 /***
213 * Matches strings that must be escaped anywhere in fields
214 * (pipe characters and newlines).
215 */
216 private final Matcher globalEscapeMatcher;
217
218 /***
219 * Matches strings that must be escaped when writing the begin
220 * of a non-comment line: the comment start character, optionally preceded
221 * by any number of backslashes. Escaped by prepending a backslash.
222 */
223 private final Matcher comstartEscapeMatcher = Pattern.compile(
224 ESCAPE_PATTERN.pattern() + "*" + COMMENT_START).matcher("");
225
226 /***
227 * Matches strings that must be unescaped when reading the
228 * begin of a non-comment line: the comment start character preceded by one
229 * or more any number of backslashes. Unescaped by removing the first
230 * backslash.
231 */
232 private final Matcher comstartUnescapeMatcher = Pattern.compile(
233 ESCAPE_PATTERN.pattern() + "+" + COMMENT_START).matcher("");
234
235 /***
236 * An optional fixed list of keys to use. If <code>null</code>, keys are
237 * read from/written to the first line of a DSV file.
238 */
239 private final String[] fixedKeys;
240
241
242 /***
243 * Creates a new empty instance, using the
244 * {@linkplain TiesConfiguration#CONF standard configuration}.
245 */
246 public DelimSepValues() {
247 this(TiesConfiguration.CONF);
248 }
249
250 /***
251 * Creates a new empty instance.
252 *
253 * @param config used to configure this instance
254 */
255 public DelimSepValues(final TiesConfiguration config) {
256 this(initFieldSep(config), initEntrySep(config), initFixedKeys(config));
257 }
258
259 /***
260 * Creates a new empty instance.
261 *
262 * @param fieldSep the field separator to use
263 * @param entrySep the entry separator to use
264 * @param fixedHeaderNames an optional fixed list of keys to use; if
265 * <code>null</code>, keys are read from/written to the first line of a
266 * DSV file
267 */
268 public DelimSepValues(final String fieldSep, final String entrySep,
269 final String[] fixedHeaderNames) {
270 super();
271 fieldSeparator = fieldSep;
272 fieldSeparatorPattern = initFieldSepPattern(fieldSeparator);
273 entrySeparator = entrySep;
274 entrySeparatorPattern = initEntrySepPattern(entrySeparator);
275 globalEscapeMatcher = initGlobalEscapeMatcher(fieldSeparatorPattern,
276 entrySeparatorPattern);
277 fixedKeys = fixedHeaderNames;
278 }
279
280 /***
281 * Creates a new instance from an XML element, fulfilling the
282 * recommandation of the {@link XMLStorable} interface. Uses the
283 * {@link TiesConfiguration#CONF standard configuration} to configure this
284 * instance.
285 *
286 * @param element the XML element containing the serialized representation
287 */
288 public DelimSepValues(final Element element) {
289 this(element, TiesConfiguration.CONF);
290 }
291
292 /***
293 * Creates a new instance from an XML element, fulfilling the
294 * recommandation of the {@link XMLStorable} interface.
295 *
296 * @param element the XML element containing the serialized representation
297 * @param config used to configure this instance
298 */
299 public DelimSepValues(final Element element,
300 final TiesConfiguration config) {
301 super(element);
302 fieldSeparator = initFieldSep(config);
303 fieldSeparatorPattern = initFieldSepPattern(fieldSeparator);
304 entrySeparator = initEntrySep(config);
305 entrySeparatorPattern = initEntrySepPattern(entrySeparator);
306 globalEscapeMatcher = initGlobalEscapeMatcher(fieldSeparatorPattern,
307 entrySeparatorPattern);
308 fixedKeys = initFixedKeys(config);
309 }
310
311
312 /***
313 * Reads data as delimiter-separated values.
314 *
315 * @param input the input data to process
316 */
317 public void read(final CharSequence input) {
318
319 read(input, fixedKeys);
320
321
322
323
324
325
326
327
328
329 }
330
331 /***
332 * Reads data as delimiter-separated values, using a specified array of
333 * field names.
334 *
335 * @param input the input data to process
336 * @param keys the array of field names; or <code>null</code> if the field
337 * names should be determined from the first line of the input
338 */
339 public void read(final CharSequence input, final String[] keys) {
340
341 int sectionNo = 0;
342 boolean readFieldNames;
343
344 if (keys != null) {
345
346 readFieldNames = true;
347 for (int i = 0; i < keys.length; i++) {
348 addKey(keys[i]);
349 }
350 } else {
351
352 readFieldNames = false;
353 }
354
355
356 String[] lines = entrySeparatorPattern.split(input);
357 String entry;
358 String[] orgFields;
359 String currentField;
360 int fIndex;
361 List<String> fieldList;
362
363 for (int i = 0; i < lines.length; i++) {
364 entry = lines[i];
365
366 if (StringUtils.isBlank(entry)) {
367
368 sectionNo++;
369 } else {
370
371 if (entry.charAt(0) != COMMENT_START) {
372 comstartUnescapeMatcher.reset(entry);
373
374 if (comstartUnescapeMatcher.lookingAt()) {
375
376 entry = lines[i].substring(1);
377 } else {
378 entry = lines[i];
379 }
380
381
382 while ((TextUtils.countLast(entry, ESCAPE) % 2) == 1) {
383 i++;
384 if (i >= lines.length) {
385 throw new IllegalArgumentException(
386 "Premature end of entry: " + entry);
387 }
388
389
390 entry = entry.substring(0, entry.length() - 1)
391 + entrySeparator + lines[i];
392 Util.LOG.debug("Joint entries to handle escape: "
393 + entry);
394 }
395
396
397 orgFields = fieldSeparatorPattern.split(entry);
398 fieldList = new ArrayList<String>(orgFields.length);
399
400
401
402 for (fIndex = 0; fIndex < orgFields.length; fIndex++) {
403 currentField = orgFields[fIndex];
404
405 while ((TextUtils.countLast(currentField, ESCAPE) % 2)
406 == 1) {
407 fIndex++;
408
409
410 currentField = currentField.substring(0,
411 currentField.length() - 1) + fieldSeparator
412 + orgFields[fIndex];
413 Util.LOG.debug("Joind fields to handle escape: "
414 + currentField);
415 }
416
417 fieldList.add(currentField);
418 }
419
420 if (readFieldNames) {
421
422 add(fieldList, sectionNo);
423 } else {
424
425 for (fIndex = 0; fIndex < fieldList.size(); fIndex++) {
426 addKey(fieldList.get(fIndex));
427 }
428 readFieldNames = true;
429 }
430 }
431 }
432 }
433 }
434
435 /***
436 * Serializes contents as delimiter-separated values.
437 *
438 * @param writer the writer to write to; flushed but not closed by this
439 * method
440 * @throws IOException if an I/O error occurs while writing to the stream
441 */
442 public void store(final Writer writer) throws IOException {
443 Iterator<String> keyIter;
444
445
446 if (fixedKeys == null) {
447 keyIter = keyIterator();
448 if (keyIter.hasNext()) {
449 storeEntry(keyIter, writer);
450 }
451 }
452
453
454 final Iterator mapsIter = entryIterator();
455 FieldMap currentMap;
456 List<Object> currentValues;
457 String currentKey;
458
459 while (mapsIter.hasNext()) {
460 currentMap = (FieldMap) mapsIter.next();
461
462
463
464 currentValues = new ArrayList<Object>(keyCount());
465 keyIter = keyIterator();
466
467 while (keyIter.hasNext()) {
468 currentKey = keyIter.next();
469
470 currentValues.add(currentMap.get(currentKey));
471 }
472
473
474 while ((currentValues.size() > 0)
475 && (currentValues.get(currentValues.size() - 1) == null)) {
476 currentValues.remove(currentValues.size() - 1);
477 }
478
479
480 storeEntry(currentValues.iterator(), writer);
481 }
482
483 writer.flush();
484 }
485
486 /***
487 * Helper method that serializes an entry as a line of delimiter-separated
488 * values. Tabs are used to separate fields; a newline is printed after
489 * the last fields. Escapes are added as required.
490 * <code>null</code> fields are handled by just adding a tab.
491 *
492 * @param iter an iterator over the fields to add
493 * @param writer the writer to write to
494 * @throws IOException if an I/O error occurs while writing
495 */
496 private void storeEntry(final Iterator iter, final Writer writer)
497 throws IOException {
498 Object rawItem;
499 Number numericItem;
500 String item;
501 boolean isFirst = true;
502
503 while (iter.hasNext()) {
504 rawItem = iter.next();
505
506 if (rawItem == null) {
507
508 item = "";
509 } else if ((rawItem instanceof Double)
510 || (rawItem instanceof Float)) {
511
512 numericItem = (Number) rawItem;
513 item = Util.format(numericItem.doubleValue());
514 } else {
515
516 item = rawItem.toString();
517 }
518
519 if (isFirst) {
520 comstartEscapeMatcher.reset(item);
521
522 if (comstartEscapeMatcher.lookingAt()) {
523
524 item = ESCAPE + item;
525 }
526 isFirst = false;
527 } else {
528
529 writer.write(fieldSeparator);
530 }
531
532
533 if (item.length() > 0) {
534
535 item = TextUtils.replaceAll(item, globalEscapeMatcher,
536 GLOBAL_REPLACEMENT);
537 writer.write(item);
538 }
539 }
540
541
542 writer.write(entrySeparator);
543 }
544
545 }