/*
 * Copyright (C) 2006 Christian Siefkes <christian@siefkes.net>.
 * Development of this software is supported by the German Research Society,
 * Berlin-Brandenburg Graduate School in Distributed Information Systems
 * (DFG grant no. GRK 316).
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, visit
 * http://www.gnu.org/licenses/gpl.html or write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
22  package de.fu_berlin.ties.eval;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.Reader;
27  import java.io.Writer;
28  import java.util.ArrayList;
29  import java.util.HashMap;
30  import java.util.Iterator;
31  import java.util.List;
32  import java.util.Map;
33  import java.util.SortedSet;
34  import java.util.TreeSet;
35  
36  import org.apache.commons.collections.Bag;
37  import org.apache.commons.collections.bag.HashBag;
38  
39  import de.fu_berlin.ties.Closeable;
40  import de.fu_berlin.ties.ContextMap;
41  import de.fu_berlin.ties.ProcessingException;
42  import de.fu_berlin.ties.TextProcessor;
43  import de.fu_berlin.ties.TiesConfiguration;
44  import de.fu_berlin.ties.classify.Prediction;
45  import de.fu_berlin.ties.extract.Extraction;
46  import de.fu_berlin.ties.extract.ExtractionContainer;
47  import de.fu_berlin.ties.extract.TargetStructure;
48  import de.fu_berlin.ties.io.FieldContainer;
49  import de.fu_berlin.ties.io.FieldMap;
50  import de.fu_berlin.ties.io.IOUtils;
51  import de.fu_berlin.ties.text.TextTokenizer;
52  import de.fu_berlin.ties.text.TokenizerFactory;
53  import de.fu_berlin.ties.util.Util;
54  
55  /***
56   * A simple goal that reads a list of
57   * {@link de.fu_berlin.ties.extract.EvaluatedExtractionContainer}s and
58   * calculates the average length (in characters and tokens) for extractions of
59   * of all types (e.g. speaker, location etc.) and all
60   * {@link de.fu_berlin.ties.eval.EvalStatus evaluation statuses} (e.g. correct,
61   * missing etc.)
62   *
63   * <p>Instances of this type are <em>not</em> thread-safe.
64   *
65   * @author Christian Siefkes
66   * @version $Revision: 1.8 $, $Date: 2006/10/21 16:04:11 $, $Author: siefkes $
67   */
68  public class AverageLength extends TextProcessor implements Closeable {
69  
70      /***
71       * The key used by the {@link #metricsByLength()} method to serialize the
72       * token lengths.
73       */
74      public static final String KEY_TOKEN_LENGTH = "TokenLength";
75  
76      /***
77       * The set of all extraction types.
78       */
79      private SortedSet<String> typeNames = null;
80  
81      /***
82       * A mapping from each extraction type to F/Precision/Recall statistics
83       * calculated separately for each token length.
84       */
85      private List<MultiFMetrics> avgMetrics = new ArrayList<MultiFMetrics>();
86  
87      /***
88       * Counts the number of extractions for each extraction type/status
89       * combination.
90       */
91      private final Bag extBag = new HashBag();
92  
93      /***
94       * Counts the number of characters for each extraction type/status
95       * combination.
96       */
97      private final Bag charBag = new HashBag();
98  
99      /***
100      * Counts the number of tokens for each extraction type/status combination.
101      */
102     private final Bag tokenBag = new HashBag();
103 
104     /***
105      * The tokenizer used to split extractions into tokens.
106      */
107     private final TextTokenizer tokenizer;
108 
109 
110     /***
111      * Creates a new instance, using a default extension and the
112      * {@linkplain TiesConfiguration#CONF standard configuration}.
113      */
114     public AverageLength() {
115         this("avl");
116     }
117 
118     /***
119      * Creates a new instance, using the
120      * {@linkplain TiesConfiguration#CONF standard configuration}.
121      *
122      * @param outExt the extension to use for output files
123      */
124     public AverageLength(final String outExt) {
125         this(outExt, TiesConfiguration.CONF);
126     }
127 
128     /***
129      * Creates a new instance.
130      *
131      * @param outExt the extension to use for output files
132      * @param conf the configuration to use
133      */
134     public AverageLength(final String outExt, final TiesConfiguration conf)  {
135         super(outExt, conf);
136         tokenizer = new TokenizerFactory(conf).createTokenizer("");
137     }
138 
139     /***
140      * Calculates the average length (in visible characters and tokens)
141      * for all extractions of all types and all evaluation statuses
142      * processed do far. Requires at least one previous call to one of the
143      * {@link #updateAverageLengths(ExtractionContainer) updateAverageLengths}
144      * methods -- otherwise there is nothing to calculate.
145      *
146      * @return an array of two field containers containing the average character
147      * counts (first container) and average token counts (second container)
148      * in a two-dimensional matrix
149      * @throws IllegalStateException if no update method has been invoked
150      */
151     public FieldContainer[] calculateAverageLengths()
152     throws IllegalStateException {
153         final FieldContainer charContainer =
154             FieldContainer.createFieldContainer(getConfig());
155         final FieldContainer tokenContainer =
156             FieldContainer.createFieldContainer(getConfig());
157         final SortedSet<String> statusSet =
158             new TreeSet<String>(EvalStatus.allInstanceStrings());
159 
160         // ensure that there is something to do
161         if (typeNames == null) {
162             throw new IllegalStateException(
163                 "calculateAverageLengths without prior updateAverageLengths");
164         }
165 
166         final Iterator<String> typeIter = typeNames.iterator();
167         Iterator<String> statusIter;
168         String type, statusName, key;
169         FieldMap charFields, tokenFields;
170         int extCount;
171         double charAvg, tokenAvg;
172 
173         while (typeIter.hasNext()) { // iterate extraction types
174             type = typeIter.next();
175             statusIter = statusSet.iterator();
176             charFields = new FieldMap();
177             tokenFields = new FieldMap();
178             charFields.put(Prediction.KEY_TYPE, type);
179             tokenFields.put(Prediction.KEY_TYPE, type);
180 
181             while (statusIter.hasNext()) {
182                 statusName = statusIter.next();
183                 key = key(type, statusName);
184                 extCount = extBag.getCount(key);
185 
186                 // store both averages if this combination did occur
187                 if (extCount > 0) {
188                     charAvg = (double) charBag.getCount(key) / extCount;
189                     tokenAvg = (double) tokenBag.getCount(key) / extCount;
190                     charFields.put(statusName, charAvg);
191                     tokenFields.put(statusName, tokenAvg);
192                 }
193             }
194 
195             charContainer.add(charFields);
196             tokenContainer.add(tokenFields);
197         }
198         return new FieldContainer[] {charContainer, tokenContainer};
199     }
200 
201     /***
202      * Analyzes an extraction container, updating the average
203      * lengths for extractions of all types and all evaluation statuses.
204      *
205      * @param extractions the container of evaluated extractions
206      */
207     public void updateAverageLengths(final ExtractionContainer extractions) {
208         // init type names from the sort extraction container encountered
209         if (typeNames == null) {
210             typeNames = new TreeSet<String>(
211                     extractions.getTargetStructure().getClassNames());
212         }
213 
214         final Iterator<Extraction> extIter = extractions.iterator();
215         Extraction ext;
216         String type, text, key;
217         EvalStatus status;
218         int numTokens, oldTokenCount, index;
219 
220         while (extIter.hasNext()) {
221             ext = extIter.next();
222             type = ext.getType();
223             text = ext.getText();
224             status = ext.getEvalStatus();
225             key = key(type, status);
226 
227             // update extractions + character count
228             extBag.add(key);
229             charBag.add(key, text.length());
230 
231             // update token count, checking for numerical overflow, just in case
232             numTokens = countTokens(text);
233             oldTokenCount = tokenBag.getCount(key);
234             tokenBag.add(key, numTokens);
235 
236             if (tokenBag.getCount(key) <= oldTokenCount) {
237                 Util.LOG.error("Numerical overflow in token count for "
238                         + key + ": " + oldTokenCount + " -> "
239                         + tokenBag.getCount(key));
240             }
241 
242             // update average F-Metrics, using tokenlength-1 as index
243             index = numTokens - 1;
244             MultiFMetrics metrics;
245 
246             while (avgMetrics.size() < index) {
247                 // insert null elements prior to this one, if necessary
248                 avgMetrics.add(null);
249             }
250 
251             // retrieve metrics for this length, creating it if necessary
252             if (avgMetrics.size() == index) {
253                 metrics = new MultiFMetrics();
254                 avgMetrics.add(metrics);
255             } else {
256                 metrics = avgMetrics.get(index);
257                 if (metrics == null) {
258                     metrics = new MultiFMetrics();
259                     avgMetrics.set(index, metrics);
260                 }
261             }
262 
263             if (status == EvalStatus.CORRECT) {
264                 metrics.incTruePos(type);
265             } else if (status == EvalStatus.MISSING) {
266                 metrics.incFalseNeg(type);
267             } else if (status == EvalStatus.SPURIOUS) {
268                 metrics.incFalsePos(type);
269             } // else there is nothing to update
270         }
271     }
272 
273     /***
274      * Analyzes the serialized contents of an extraction container,
275      * delegating to {@link #updateAverageLengths(ExtractionContainer)}.
276      *
277      * @param reader reader containg the extractions to analyse in
278      * {@link de.fu_berlin.ties.io.DelimSepValues} format; not closed by this
279      * method
280      * @throws IOException if an I/O error occurs while reading the extractions
281      */
282     public void updateAverageLengths(final Reader reader) throws IOException {
283         // read extraction + delegate
284         final FieldContainer fContainer =
285             FieldContainer.createFieldContainer(getConfig());
286         fContainer.read(reader);
287         final ExtractionContainer extraction = new ExtractionContainer(
288                 new TargetStructure(getConfig()), fContainer);
289         updateAverageLengths(extraction);
290     }
291 
292     /***
293      * {@inheritDoc}
294      */
295     public void close(final int errorCount) throws IOException {
296         if (errorCount == 0) {
297             // calculate averages and length-based metrics
298             FieldContainer[] containers = calculateAverageLengths();
299             Map<String, FieldContainer> metricsByLength = metricsByLength();
300             final String basename = "average";
301 
302             // serialize containers
303             final File outDir = IOUtils.determineOutputDirectory(getConfig());
304             containers[0].storeInFile(outDir, basename, "chars", getConfig());
305             containers[1].storeInFile(outDir, basename, "tokens", getConfig());
306 
307             for (String metrics : metricsByLength.keySet()) {
308                 metricsByLength.get(metrics).storeInFile(outDir, basename,
309                         metrics.toLowerCase(), getConfig());
310             }
311 
312             Util.LOG.info("Stored average character counts and token counts "
313                     + " and metrics-by-length in " + basename + "*files");
314         }
315     }
316 
317     /***
318      * Counts the tokens in a text.
319      *
320      * @param text the text to process
321      * @return the number of tokens in the text
322      */
323     private int countTokens(final String text) {
324         tokenizer.reset(text);
325         int result = 0;
326 
327         while (tokenizer.nextToken() != null) {
328             result++;
329         }
330 
331         return result;
332     }
333 
334     /***
335      * {@inheritDoc}
336      */
337     protected void doProcess(final Reader reader, final Writer writer,
338             final ContextMap context) throws IOException, ProcessingException {
339         // delegate, output will be generated in close( ) method
340         updateAverageLengths(reader);
341     }
342 
343     /***
344      * Generates a key string combining type and status of an extraction.
345      *
346      * @param type the type of the extraction
347      * @param status the evaluation status of the extraction
348      * @return a combined string representation
349      */
350     private String key(final String type, final EvalStatus status) {
351         return key(type, status.getName());
352     }
353 
354     /***
355      * Generates a key string combining type and status of an extraction.
356      *
357      * @param type the type of the extraction
358      * @param statusString string representing the evaluation status
359      * @return a combined string representation
360      */
361     private String key(final String type, final String statusString) {
362         // use space to separate type + status
363         return type + ' ' + statusString;
364     }
365 
366     /***
367      * Returns the usual {@link FMetrics metrics} F-measure, precision and
368      * recall, calculated separately for all extractions of the same type (as
369      * usual) <em>and token  length</em>. The return map contains the names of
370      * the three metrics as keys and an 2-dimensional representation of their
371      * values, indexed by extraction types as column names and token lengths
372      * as row names. In a fourth container, the number of answer keys
373      * (expected extractions) is returned.
374      *
375      * @return a mapping from metrics to field containers as described above
376      * @throws IllegalStateException if no update method has been invoked
377      */
378     public Map<String, FieldContainer> metricsByLength()
379     throws IllegalStateException {
380         // ensure that there is something to do
381         if (typeNames == null) {
382             throw new IllegalStateException(
383                 "calculateAverageLengths without prior updateAverageLengths");
384         }
385 
386         final FieldContainer fMeasure =
387             FieldContainer.createFieldContainer(getConfig());
388         final FieldContainer precision =
389             FieldContainer.createFieldContainer(getConfig());
390         final FieldContainer recall =
391             FieldContainer.createFieldContainer(getConfig());
392         final FieldContainer answerKeys =
393             FieldContainer.createFieldContainer(getConfig());
394         FieldMap fMap, pMap, rMap, ansMap;
395 
396         final int maxLength = avgMetrics.size();
397         MultiFMetrics multiMetrics;
398         FMetricsView metricsView;
399         Iterator<String> typeIter;
400         String extractionType;
401         long expectedAnswers;
402 
403         for (int i = 0; i < maxLength;) {
404             multiMetrics = avgMetrics.get(i++); // increasing here, not in "for"
405 
406             if (multiMetrics != null) {
407                 // index has been increased so it is now the token length
408                 fMap = new FieldMap(KEY_TOKEN_LENGTH, i);
409                 pMap = new FieldMap(KEY_TOKEN_LENGTH, i);
410                 rMap = new FieldMap(KEY_TOKEN_LENGTH, i);
411                 ansMap = new FieldMap(KEY_TOKEN_LENGTH, i);
412 
413                 // serialize metrics (F, P, R) for all types for this length
414                 typeIter = typeNames.iterator();
415                 while (typeIter.hasNext()) {
416                     extractionType = typeIter.next();
417                     metricsView = multiMetrics.view(extractionType);
418 
419                     if (metricsView != null) {
420                         fMap.put(extractionType, metricsView.getF1Measure());
421                         pMap.put(extractionType, metricsView.getPrecision());
422                         rMap.put(extractionType, metricsView.getRecall());
423                         expectedAnswers = metricsView.getTruePos()
424                             + metricsView.getFalseNeg();
425                         ansMap.put(extractionType, expectedAnswers);
426                     }
427                 }
428 
429                 // add maps to respective containers
430                 fMeasure.add(fMap);
431                 precision.add(pMap);
432                 recall.add(rMap);
433                 answerKeys.add(ansMap);
434             } else {
435                 Util.LOG.debug("No metrics found for token length " + i);
436             }
437         }
438 
439         // create and return result
440         final Map<String, FieldContainer> result =
441             new HashMap<String, FieldContainer>();
442         result.put(FMetrics.KEY_F1_MEASURE, fMeasure);
443         result.put(FMetrics.KEY_PRECISION, precision);
444         result.put(FMetrics.KEY_RECALL, recall);
445         result.put("AnswerKeys", answerKeys);
446         return result;
447     }
448 
449 }