22 package de.fu_berlin.ties.eval;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.Reader;
27 import java.io.Writer;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.Iterator;
31 import java.util.List;
32 import java.util.Map;
33 import java.util.SortedSet;
34 import java.util.TreeSet;
35
36 import org.apache.commons.collections.Bag;
37 import org.apache.commons.collections.bag.HashBag;
38
39 import de.fu_berlin.ties.Closeable;
40 import de.fu_berlin.ties.ContextMap;
41 import de.fu_berlin.ties.ProcessingException;
42 import de.fu_berlin.ties.TextProcessor;
43 import de.fu_berlin.ties.TiesConfiguration;
44 import de.fu_berlin.ties.classify.Prediction;
45 import de.fu_berlin.ties.extract.Extraction;
46 import de.fu_berlin.ties.extract.ExtractionContainer;
47 import de.fu_berlin.ties.extract.TargetStructure;
48 import de.fu_berlin.ties.io.FieldContainer;
49 import de.fu_berlin.ties.io.FieldMap;
50 import de.fu_berlin.ties.io.IOUtils;
51 import de.fu_berlin.ties.text.TextTokenizer;
52 import de.fu_berlin.ties.text.TokenizerFactory;
53 import de.fu_berlin.ties.util.Util;
54
55 /***
56 * A simple goal that reads a list of
57 * {@link de.fu_berlin.ties.extract.EvaluatedExtractionContainer}s and
58 * calculates the average length (in characters and tokens) for extractions of
59 * of all types (e.g. speaker, location etc.) and all
60 * {@link de.fu_berlin.ties.eval.EvalStatus evaluation statuses} (e.g. correct,
61 * missing etc.)
62 *
63 * <p>Instances of this type are <em>not</em> thread-safe.
64 *
65 * @author Christian Siefkes
66 * @version $Revision: 1.8 $, $Date: 2006/10/21 16:04:11 $, $Author: siefkes $
67 */
68 public class AverageLength extends TextProcessor implements Closeable {
69
70 /***
71 * The key used by the {@link #metricsByLength()} method to serialize the
72 * token lengths.
73 */
74 public static final String KEY_TOKEN_LENGTH = "TokenLength";
75
76 /***
77 * The set of all extraction types.
78 */
79 private SortedSet<String> typeNames = null;
80
81 /***
82 * A mapping from each extraction type to F/Precision/Recall statistics
83 * calculated separately for each token length.
84 */
85 private List<MultiFMetrics> avgMetrics = new ArrayList<MultiFMetrics>();
86
87 /***
88 * Counts the number of extractions for each extraction type/status
89 * combination.
90 */
91 private final Bag extBag = new HashBag();
92
93 /***
94 * Counts the number of characters for each extraction type/status
95 * combination.
96 */
97 private final Bag charBag = new HashBag();
98
99 /***
100 * Counts the number of tokens for each extraction type/status combination.
101 */
102 private final Bag tokenBag = new HashBag();
103
104 /***
105 * The tokenizer used to split extractions into tokens.
106 */
107 private final TextTokenizer tokenizer;
108
109
110 /***
111 * Creates a new instance, using a default extension and the
112 * {@linkplain TiesConfiguration#CONF standard configuration}.
113 */
114 public AverageLength() {
115 this("avl");
116 }
117
118 /***
119 * Creates a new instance, using the
120 * {@linkplain TiesConfiguration#CONF standard configuration}.
121 *
122 * @param outExt the extension to use for output files
123 */
124 public AverageLength(final String outExt) {
125 this(outExt, TiesConfiguration.CONF);
126 }
127
128 /***
129 * Creates a new instance.
130 *
131 * @param outExt the extension to use for output files
132 * @param conf the configuration to use
133 */
134 public AverageLength(final String outExt, final TiesConfiguration conf) {
135 super(outExt, conf);
136 tokenizer = new TokenizerFactory(conf).createTokenizer("");
137 }
138
139 /***
140 * Calculates the average length (in visible characters and tokens)
141 * for all extractions of all types and all evaluation statuses
142 * processed do far. Requires at least one previous call to one of the
143 * {@link #updateAverageLengths(ExtractionContainer) updateAverageLengths}
144 * methods -- otherwise there is nothing to calculate.
145 *
146 * @return an array of two field containers containing the average character
147 * counts (first container) and average token counts (second container)
148 * in a two-dimensional matrix
149 * @throws IllegalStateException if no update method has been invoked
150 */
151 public FieldContainer[] calculateAverageLengths()
152 throws IllegalStateException {
153 final FieldContainer charContainer =
154 FieldContainer.createFieldContainer(getConfig());
155 final FieldContainer tokenContainer =
156 FieldContainer.createFieldContainer(getConfig());
157 final SortedSet<String> statusSet =
158 new TreeSet<String>(EvalStatus.allInstanceStrings());
159
160
161 if (typeNames == null) {
162 throw new IllegalStateException(
163 "calculateAverageLengths without prior updateAverageLengths");
164 }
165
166 final Iterator<String> typeIter = typeNames.iterator();
167 Iterator<String> statusIter;
168 String type, statusName, key;
169 FieldMap charFields, tokenFields;
170 int extCount;
171 double charAvg, tokenAvg;
172
173 while (typeIter.hasNext()) {
174 type = typeIter.next();
175 statusIter = statusSet.iterator();
176 charFields = new FieldMap();
177 tokenFields = new FieldMap();
178 charFields.put(Prediction.KEY_TYPE, type);
179 tokenFields.put(Prediction.KEY_TYPE, type);
180
181 while (statusIter.hasNext()) {
182 statusName = statusIter.next();
183 key = key(type, statusName);
184 extCount = extBag.getCount(key);
185
186
187 if (extCount > 0) {
188 charAvg = (double) charBag.getCount(key) / extCount;
189 tokenAvg = (double) tokenBag.getCount(key) / extCount;
190 charFields.put(statusName, charAvg);
191 tokenFields.put(statusName, tokenAvg);
192 }
193 }
194
195 charContainer.add(charFields);
196 tokenContainer.add(tokenFields);
197 }
198 return new FieldContainer[] {charContainer, tokenContainer};
199 }
200
201 /***
202 * Analyzes an extraction container, updating the average
203 * lengths for extractions of all types and all evaluation statuses.
204 *
205 * @param extractions the container of evaluated extractions
206 */
207 public void updateAverageLengths(final ExtractionContainer extractions) {
208
209 if (typeNames == null) {
210 typeNames = new TreeSet<String>(
211 extractions.getTargetStructure().getClassNames());
212 }
213
214 final Iterator<Extraction> extIter = extractions.iterator();
215 Extraction ext;
216 String type, text, key;
217 EvalStatus status;
218 int numTokens, oldTokenCount, index;
219
220 while (extIter.hasNext()) {
221 ext = extIter.next();
222 type = ext.getType();
223 text = ext.getText();
224 status = ext.getEvalStatus();
225 key = key(type, status);
226
227
228 extBag.add(key);
229 charBag.add(key, text.length());
230
231
232 numTokens = countTokens(text);
233 oldTokenCount = tokenBag.getCount(key);
234 tokenBag.add(key, numTokens);
235
236 if (tokenBag.getCount(key) <= oldTokenCount) {
237 Util.LOG.error("Numerical overflow in token count for "
238 + key + ": " + oldTokenCount + " -> "
239 + tokenBag.getCount(key));
240 }
241
242
243 index = numTokens - 1;
244 MultiFMetrics metrics;
245
246 while (avgMetrics.size() < index) {
247
248 avgMetrics.add(null);
249 }
250
251
252 if (avgMetrics.size() == index) {
253 metrics = new MultiFMetrics();
254 avgMetrics.add(metrics);
255 } else {
256 metrics = avgMetrics.get(index);
257 if (metrics == null) {
258 metrics = new MultiFMetrics();
259 avgMetrics.set(index, metrics);
260 }
261 }
262
263 if (status == EvalStatus.CORRECT) {
264 metrics.incTruePos(type);
265 } else if (status == EvalStatus.MISSING) {
266 metrics.incFalseNeg(type);
267 } else if (status == EvalStatus.SPURIOUS) {
268 metrics.incFalsePos(type);
269 }
270 }
271 }
272
273 /***
274 * Analyzes the serialized contents of an extraction container,
275 * delegating to {@link #updateAverageLengths(ExtractionContainer)}.
276 *
277 * @param reader reader containg the extractions to analyse in
278 * {@link de.fu_berlin.ties.io.DelimSepValues} format; not closed by this
279 * method
280 * @throws IOException if an I/O error occurs while reading the extractions
281 */
282 public void updateAverageLengths(final Reader reader) throws IOException {
283
284 final FieldContainer fContainer =
285 FieldContainer.createFieldContainer(getConfig());
286 fContainer.read(reader);
287 final ExtractionContainer extraction = new ExtractionContainer(
288 new TargetStructure(getConfig()), fContainer);
289 updateAverageLengths(extraction);
290 }
291
292 /***
293 * {@inheritDoc}
294 */
295 public void close(final int errorCount) throws IOException {
296 if (errorCount == 0) {
297
298 FieldContainer[] containers = calculateAverageLengths();
299 Map<String, FieldContainer> metricsByLength = metricsByLength();
300 final String basename = "average";
301
302
303 final File outDir = IOUtils.determineOutputDirectory(getConfig());
304 containers[0].storeInFile(outDir, basename, "chars", getConfig());
305 containers[1].storeInFile(outDir, basename, "tokens", getConfig());
306
307 for (String metrics : metricsByLength.keySet()) {
308 metricsByLength.get(metrics).storeInFile(outDir, basename,
309 metrics.toLowerCase(), getConfig());
310 }
311
312 Util.LOG.info("Stored average character counts and token counts "
313 + " and metrics-by-length in " + basename + "*files");
314 }
315 }
316
317 /***
318 * Counts the tokens in a text.
319 *
320 * @param text the text to process
321 * @return the number of tokens in the text
322 */
323 private int countTokens(final String text) {
324 tokenizer.reset(text);
325 int result = 0;
326
327 while (tokenizer.nextToken() != null) {
328 result++;
329 }
330
331 return result;
332 }
333
334 /***
335 * {@inheritDoc}
336 */
337 protected void doProcess(final Reader reader, final Writer writer,
338 final ContextMap context) throws IOException, ProcessingException {
339
340 updateAverageLengths(reader);
341 }
342
343 /***
344 * Generates a key string combining type and status of an extraction.
345 *
346 * @param type the type of the extraction
347 * @param status the evaluation status of the extraction
348 * @return a combined string representation
349 */
350 private String key(final String type, final EvalStatus status) {
351 return key(type, status.getName());
352 }
353
354 /***
355 * Generates a key string combining type and status of an extraction.
356 *
357 * @param type the type of the extraction
358 * @param statusString string representing the evaluation status
359 * @return a combined string representation
360 */
361 private String key(final String type, final String statusString) {
362
363 return type + ' ' + statusString;
364 }
365
366 /***
367 * Returns the usual {@link FMetrics metrics} F-measure, precision and
368 * recall, calculated separately for all extractions of the same type (as
369 * usual) <em>and token length</em>. The return map contains the names of
370 * the three metrics as keys and an 2-dimensional representation of their
371 * values, indexed by extraction types as column names and token lengths
372 * as row names. In a fourth container, the number of answer keys
373 * (expected extractions) is returned.
374 *
375 * @return a mapping from metrics to field containers as described above
376 * @throws IllegalStateException if no update method has been invoked
377 */
378 public Map<String, FieldContainer> metricsByLength()
379 throws IllegalStateException {
380
381 if (typeNames == null) {
382 throw new IllegalStateException(
383 "calculateAverageLengths without prior updateAverageLengths");
384 }
385
386 final FieldContainer fMeasure =
387 FieldContainer.createFieldContainer(getConfig());
388 final FieldContainer precision =
389 FieldContainer.createFieldContainer(getConfig());
390 final FieldContainer recall =
391 FieldContainer.createFieldContainer(getConfig());
392 final FieldContainer answerKeys =
393 FieldContainer.createFieldContainer(getConfig());
394 FieldMap fMap, pMap, rMap, ansMap;
395
396 final int maxLength = avgMetrics.size();
397 MultiFMetrics multiMetrics;
398 FMetricsView metricsView;
399 Iterator<String> typeIter;
400 String extractionType;
401 long expectedAnswers;
402
403 for (int i = 0; i < maxLength;) {
404 multiMetrics = avgMetrics.get(i++);
405
406 if (multiMetrics != null) {
407
408 fMap = new FieldMap(KEY_TOKEN_LENGTH, i);
409 pMap = new FieldMap(KEY_TOKEN_LENGTH, i);
410 rMap = new FieldMap(KEY_TOKEN_LENGTH, i);
411 ansMap = new FieldMap(KEY_TOKEN_LENGTH, i);
412
413
414 typeIter = typeNames.iterator();
415 while (typeIter.hasNext()) {
416 extractionType = typeIter.next();
417 metricsView = multiMetrics.view(extractionType);
418
419 if (metricsView != null) {
420 fMap.put(extractionType, metricsView.getF1Measure());
421 pMap.put(extractionType, metricsView.getPrecision());
422 rMap.put(extractionType, metricsView.getRecall());
423 expectedAnswers = metricsView.getTruePos()
424 + metricsView.getFalseNeg();
425 ansMap.put(extractionType, expectedAnswers);
426 }
427 }
428
429
430 fMeasure.add(fMap);
431 precision.add(pMap);
432 recall.add(rMap);
433 answerKeys.add(ansMap);
434 } else {
435 Util.LOG.debug("No metrics found for token length " + i);
436 }
437 }
438
439
440 final Map<String, FieldContainer> result =
441 new HashMap<String, FieldContainer>();
442 result.put(FMetrics.KEY_F1_MEASURE, fMeasure);
443 result.put(FMetrics.KEY_PRECISION, precision);
444 result.put(FMetrics.KEY_RECALL, recall);
445 result.put("AnswerKeys", answerKeys);
446 return result;
447 }
448
449 }