1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.io;
23
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.InputStreamReader;
27 import java.io.Reader;
28 import java.io.Writer;
29 import java.util.ArrayList;
30 import java.util.Iterator;
31 import java.util.List;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34
35 import org.apache.commons.lang.StringUtils;
36
37 import de.fu_berlin.ties.text.TextUtils;
38 import de.fu_berlin.ties.util.Util;
39
40 /***
41 * A field container that stores and processed its contents as values separated
42 * by a pipe character ('|'). Entries are separated by newlines.
43 *
44 * <p>The very first non-comment line contains the field names, separated by
45 * pipes. Each further non-comment lines contains the values of a entry
46 * ({@link de.fu_berlin.ties.io.FieldMap}). The first non-comment line must
47 * contain as least as most pipes as each further line, so the field names
48 * are known for all fields of all field maps. Empty fields are ignored when
49 * populating field maps.
50 *
51 * <p>The '#' character can be used to introduce comment lines.
52 * Comments and lines that are empty or contain only whitespace are ignored
53 * when reading data.
54 *
55 * <p>Pipe characters and newlines within fields and '#' at the begin of the
56 * first field of an entry are escaped with a backslash.
57 * The recommended character set for storing delimiter-separated values is
58 * <strong>UTF-8</strong>.
59 *
60 * <p>Floats and doubles are formatted (rounded) via
61 * {@link de.fu_berlin.ties.util.Util#format(double)}.
62 *
63 * @author Christian Siefkes
64 * @version $Revision: 1.4 $, $Date: 2004/09/06 17:23:31 $, $Author: siefkes $
65 */
66 public class DelimSepValues extends FieldContainer {
67
68 /***
69 * The recommended file extension for this format: {@value}
70 * (delimiter-separated values).
71 */
72 public static final String FILE_EXT = "dsv";
73
74 /***
75 * The character introducing a comment: '#'.
76 */
77 public static final char COMMENT_START = '#';
78
79 /***
80 * The escape character: '\' (a backslash).
81 */
82 public static final char ESCAPE = '//';
83
84 /***
85 * The escape character as a regular expression.
86 */
87 private static final Pattern ESCAPE_PATTERN = Pattern.compile("////");
88
89 /***
90 * The field delimiter character: '|' (a pipe).
91 */
92 public static final char DELIM = '|';
93
94 /***
95 * The field delimiter character as a regular expression.
96 */
97 private static final Pattern DELIM_PATTERN = Pattern.compile("//|");
98
99 /***
100 * String specifying the replacement to use for the
101 * {@link #globalEscapeMatcher}: prepend the escape character to the match.
102 */
103 private static final String GLOBAL_REPLACEMENT =
104 ESCAPE_PATTERN.pattern() + "$0";
105
106 /***
107 * Matches strings that must be escaped anywhere in fields
108 * (pipe characters and newlines).
109 */
110 private final Matcher globalEscapeMatcher = Pattern.compile("(?:"
111 + DELIM_PATTERN.pattern() + '|' + TextUtils.NEWLINE_ALTERNATIVES
112 + ')').matcher("");
113
114 /***
115 * Matches strings that must be escaped when writing the begin
116 * of a non-comment line: the comment start character, optionally preceded
117 * by any number of backslashes. Escaped by prepending a backslash.
118 */
119 private final Matcher comstartEscapeMatcher = Pattern.compile(
120 ESCAPE_PATTERN.pattern() + "*" + COMMENT_START).matcher("");
121
122 /***
123 * Matches strings that must be unescaped when reading the
124 * begin of a non-comment line: the comment start character preceded by one
125 * or more any number of backslashes. Unescaped by removing the first
126 * backslash.
127 */
128 private final Matcher comstartUnescapeMatcher = Pattern.compile(
129 ESCAPE_PATTERN.pattern() + "+" + COMMENT_START).matcher("");
130
131 /***
132 * Creates a new empty instance.
133 */
134 public DelimSepValues() {
135 super();
136 }
137
138 /***
139 * Creates a new instance from serialized delimiter-separated values.
140 *
141 * @param input the input data to process
142 * @throws IllegalArgumentException if the input data contains errors,
143 * e.g. when there are insufficient field names given in the first
144 * non-comment line
145 */
146 public DelimSepValues(final CharSequence input)
147 throws IllegalArgumentException {
148 this();
149
150 String[] lines = TextUtils.splitLinesExact(input);
151 boolean readFieldNames = false;
152 String entry;
153 String[] orgFields;
154 String currentField;
155 int fIndex;
156 List<String> fieldList;
157
158 for (int i = 0; i < lines.length; i++) {
159 entry = lines[i];
160
161
162 if (StringUtils.isNotBlank(entry)
163 && (entry.charAt(0) != COMMENT_START)) {
164 comstartUnescapeMatcher.reset(entry);
165
166 if (comstartUnescapeMatcher.lookingAt()) {
167
168 entry = lines[i].substring(1);
169 } else {
170 entry = lines[i];
171 }
172
173
174 while ((TextUtils.countLast(entry, ESCAPE) % 2) == 1) {
175 i++;
176 if (i >= lines.length) {
177 throw new IllegalArgumentException(
178 "Premature end of entry: " + entry);
179 }
180
181
182 entry = entry.substring(0, entry.length() - 1)
183 + TextUtils.LINE_SEPARATOR + lines[i];
184 Util.LOG.debug("Joint lines to handle escape: " + entry);
185 }
186
187
188 orgFields = DELIM_PATTERN.split(entry);
189 fieldList = new ArrayList<String>(orgFields.length);
190
191
192
193 for (fIndex = 0; fIndex < orgFields.length; fIndex++) {
194 currentField = orgFields[fIndex];
195
196 while ((TextUtils.countLast(currentField, ESCAPE) % 2)
197 == 1) {
198 fIndex++;
199
200
201 currentField = currentField.substring(0,
202 entry.length() - 1) + DELIM + orgFields[fIndex];
203 Util.LOG.debug("Joint fields to handle escape: "
204 + currentField);
205 }
206
207 fieldList.add(currentField);
208 }
209
210 if (readFieldNames) {
211
212 add(fieldList);
213 } else {
214
215 for (fIndex = 0; fIndex < fieldList.size(); fIndex++) {
216 addKey(fieldList.get(fIndex));
217 }
218 readFieldNames = true;
219 }
220 }
221 }
222 }
223
224 /***
225 * Creates a new instance from serialized delimiter-separated values.
226 *
227 * @param in a stream containing the input data to process, must use the
228 * UTF-8 charset; the stream is not closed by this method
229 * @throws IOException if an I/O error occurs while reading from the stream
230 * @throws IllegalArgumentException if the input data contains errors,
231 * esp. when there are insufficient field names given in the first
232 * non-comment line
233 */
234 public DelimSepValues(final InputStream in)
235 throws IOException, IllegalArgumentException {
236
237 this(new InputStreamReader(in, IOUtils.STANDARD_UNICODE_CHARSET));
238 }
239
240 /***
241 * Creates a new instance from serialized delimiter-separated values.
242 *
243 * @param reader a reader containing the input data to process; not closed
244 * by this method
245 * @throws IOException if an I/O error occurs while reading
246 * @throws IllegalArgumentException if the input data contains errors,
247 * esp. when there are insufficient field names given in the first
248 * non-comment line
249 */
250 public DelimSepValues(final Reader reader)
251 throws IOException, IllegalArgumentException {
252 this(IOUtils.readToString(reader));
253 }
254
255 /***
256 * Creates a new instance and populates it from a {@link StorableContainer}.
257 *
258 * @param contents the contents to add by calling
259 * {@link StorableContainer#storeEntries(FieldContainer)}
260 */
261 public DelimSepValues(final StorableContainer contents) {
262 super(contents);
263 }
264
265 /***
266 * Serializes contents as delimiter-separated values.
267 *
268 * @param writer the writer to write to; flushed but not closed by this
269 * method
270 * @throws IOException if an I/O error occurs while writing to the stream
271 */
272 public void store(final Writer writer) throws IOException {
273
274 Iterator<String> keyIter = keyIterator();
275 storeEntry(keyIter, writer);
276
277
278 final Iterator mapsIter = entryIterator();
279 FieldMap currentMap;
280 List<Object> currentValues;
281 String currentKey;
282
283 while (mapsIter.hasNext()) {
284 currentMap = (FieldMap) mapsIter.next();
285
286
287
288 currentValues = new ArrayList<Object>(keyCount());
289 keyIter = keyIterator();
290
291 while (keyIter.hasNext()) {
292 currentKey = keyIter.next();
293
294 currentValues.add(currentMap.get(currentKey));
295 }
296
297
298 while ((currentValues.size() > 0)
299 && (currentValues.get(currentValues.size() - 1) == null)) {
300 currentValues.remove(currentValues.size() - 1);
301 }
302
303
304 storeEntry(currentValues.iterator(), writer);
305 }
306
307 writer.flush();
308 }
309
310 /***
311 * Helper method that serializes an entry as a line of delimiter-separated
312 * values. Tabs are used to separate fields; a newline is printed after
313 * the last fields. Escapes are added as required.
314 * <code>null</code> fields are handled by just adding a tab.
315 *
316 * @param iter an iterator over the fields to add
317 * @param writer the writer to write to
318 * @throws IOException if an I/O error occurs while writing
319 */
320 private void storeEntry(final Iterator iter, final Writer writer)
321 throws IOException {
322 Object rawItem;
323 Number numericItem;
324 String item;
325 boolean isFirst = true;
326
327 while (iter.hasNext()) {
328 rawItem = iter.next();
329
330 if (rawItem == null) {
331
332 item = "";
333 } else if ((rawItem instanceof Double)
334 || (rawItem instanceof Float)) {
335
336 numericItem = (Number) rawItem;
337 item = Util.format(numericItem.doubleValue());
338 } else {
339
340 item = rawItem.toString();
341 }
342
343 if (isFirst) {
344 comstartEscapeMatcher.reset(item);
345
346 if (comstartEscapeMatcher.lookingAt()) {
347
348 item = ESCAPE + item;
349 }
350 isFirst = false;
351 } else {
352
353 writer.write(DELIM);
354 }
355
356
357 if (item.length() > 0) {
358
359 item = TextUtils.replaceAll(item, globalEscapeMatcher,
360 GLOBAL_REPLACEMENT);
361 writer.write(item);
362 }
363 }
364
365
366 writer.write(TextUtils.LINE_SEPARATOR);
367 }
368
369 }