/*
 * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
 * Development of this software is supported by the German Research Society,
 * Berlin-Brandenburg Graduate School in Distributed Information Systems
 * (DFG grant no. GRK 316).
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, visit
 * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 */
package de.fu_berlin.ties.classify;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import de.fu_berlin.ties.ContextMap;
import de.fu_berlin.ties.ProcessingException;
import de.fu_berlin.ties.TextProcessor;
import de.fu_berlin.ties.TiesConfiguration;
import de.fu_berlin.ties.classify.feature.DefaultFeatureVector;
import de.fu_berlin.ties.classify.feature.FeatureVector;
import de.fu_berlin.ties.io.FieldContainer;
import de.fu_berlin.ties.io.FieldMap;
import de.fu_berlin.ties.io.IOUtils;
import de.fu_berlin.ties.text.TextTokenizer;
import de.fu_berlin.ties.text.TokenizerFactory;
import de.fu_berlin.ties.util.Util;

/***
 * Classifies a list of files, training the classifier on each error. See
 * {@link #classifyAndTrain(FieldContainer, File, String)} for a description of
 * input and output formats.
 *
 * <p>This class does not calculate statistics; you can do so by calling, e.g.,
 * <code>tail -q --lines 500 <em>FILENAME</em>|grep -v "|+"|wc</code> on the
 * output serialized in {@link de.fu_berlin.ties.io.DelimSepValues} format to
 * get the number of errors during the last 500 classifications (assuming that
 * class names do not start with a "+").
 *
 * <p>Instances of this class are thread-safe.
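 *
 * <p>A minimal usage sketch (the reader, writer, directory, and character
 * set shown are only illustrative):
 * <pre>
 * // read the list of files (with File and Class fields) to classify
 * FieldContainer files = FieldContainer.createFieldContainer(listReader);
 * ClassTrain trainer = new ClassTrain();
 * // classify each listed file relative to the given directory,
 * // training the classifier whenever a prediction is wrong
 * FieldContainer results =
 *     trainer.classifyAndTrain(files, new File("corpus"), "UTF-8");
 * // serialize the results, including the Classification field
 * results.store(resultWriter);
 * </pre>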
 *
 * @author Christian Siefkes
 * @version $Revision: 1.13 $, $Date: 2004/11/17 09:15:10 $, $Author: siefkes $
 */
public class ClassTrain extends TextProcessor {

    /***
     * Configuration key: The extension to append to file names given via the
     * {@linkplain #KEY_FILE File key} (if any).
     */
    public static final String CONFIG_FILE_EXT = "file.ext";

    /***
     * Serialization key for the name of the file to classify.
     */
    public static final String KEY_FILE = "File";

    /***
     * Serialization key for the correct class.
     */
    public static final String KEY_CLASS = "Class";

    /***
     * Serialization key for the result of the classification: either
     * {@link #CORRECT_CLASS} if the correct class was predicted or the
     * wrongly predicted class in case of an error.
     */
    public static final String KEY_CLASSIFICATION = "Classification";

    /***
     * Value of the {@link #KEY_CLASSIFICATION} field for correct predictions:
     * {@value}.
     */
    public static final String CORRECT_CLASS = "+";

    /***
     * Used to create tokenizers.
     */
    private final TokenizerFactory tFactory;

    /***
     * The extension to append to file names given via the
     * {@linkplain #KEY_FILE File key}; empty string if none.
     */
    private final String fileExtension;

    /***
     * Creates a new instance using a default extension and the
     * {@link TiesConfiguration#CONF standard configuration}.
     */
    public ClassTrain() {
        this("cls");
    }

    /***
     * Creates a new instance using the
     * {@link TiesConfiguration#CONF standard configuration}.
     *
     * @param outExt the extension to use for output files
     */
    public ClassTrain(final String outExt) {
        this(outExt, TiesConfiguration.CONF);
    }

    /***
     * Creates a new instance from the provided configuration.
     *
     * @param outExt the extension to use for output files
     * @param conf used to configure this instance; if <code>null</code>,
     * the {@linkplain TiesConfiguration#CONF standard configuration} is used
     */
    public ClassTrain(final String outExt, final TiesConfiguration conf) {
        this(outExt, conf,
            new TokenizerFactory(conf, Classifier.CONFIG_CLASSIFIER),
            conf.getString(CONFIG_FILE_EXT, ""));
    }

    /***
     * Creates a new instance.
     *
     * @param outExt the extension to use for output files
     * @param conf used to configure this instance; if <code>null</code>,
     * the {@linkplain TiesConfiguration#CONF standard configuration} is used
     * @param factory used to create tokenizers
     * @param fileExt the extension to append to file names given via the
     * {@linkplain #KEY_FILE File key}; <code>null</code> or the empty string
     * if none should be appended
     */
    public ClassTrain(final String outExt, final TiesConfiguration conf,
        final TokenizerFactory factory, final String fileExt) {
        super(outExt, conf);
        tFactory = factory;
        fileExtension = (fileExt != null) ? fileExt : "";
    }

    /***
     * Classifies a list of files, training the classifier on each error.
     *
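     * <p>A sketch of inspecting the returned container (the variable names
     * are only illustrative):
     * <pre>
     * Iterator it = results.entryIterator();
     * while (it.hasNext()) {
     *     FieldMap entry = (FieldMap) it.next();
     *     if (!CORRECT_CLASS.equals(entry.get(KEY_CLASSIFICATION))) {
     *         // misclassified: the field holds the wrongly predicted class
     *     }
     * }
     * </pre>
     *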
     * @param filesToClassify a field container of the files to process; each
     * entry must contain a {@link #KEY_FILE} field giving the name of the file
     * to classify and {@link #KEY_CLASS} giving the true class of the file
     * @param directory file names are relative to this directory; if
     * <code>null</code> they are relative to the working directory
     * @param charset the character set of the files to process
     * @return a field container of the classification results; in addition to
     * the fields given above, each entry will contain the classification result
     * in a {@link #KEY_CLASSIFICATION} field: {@link #CORRECT_CLASS} in
     * case of a correct classification, the name of the wrongly predicted
     * class otherwise
     * @throws IOException if an I/O error occurs
     * @throws ProcessingException if an error occurs during processing
     */
    public FieldContainer classifyAndTrain(final FieldContainer filesToClassify,
            final File directory, final String charset)
            throws IOException, ProcessingException {
        final FieldContainer result = FieldContainer.createFieldContainer();
        final int numFiles = filesToClassify.size();
        FieldMap inMap;
        FieldMap outMap;
        String currentClass;
        String[] filenames = new String[numFiles];
        String[] classes = new String[numFiles];
        final Iterator fileIter = filesToClassify.entryIterator();
        final Set<String> classSet = new HashSet<String>();
        int i = 0;

        // collect files to process and determine set of classes
        while (fileIter.hasNext()) {
            inMap = (FieldMap) fileIter.next();
            filenames[i] = (String) inMap.get(KEY_FILE);
            currentClass = (String) inMap.get(KEY_CLASS);
            classSet.add(currentClass);
            classes[i] = currentClass;
            i++;
        }

        // initialize classifier and tokenizer
        final TrainableClassifier classifier =
            TrainableClassifier.createClassifier(classSet, getConfig());
        final TextTokenizer tokenizer = tFactory.createTokenizer("");

        // ensure that prediction model is empty
        classifier.reset();

        Reader reader;
        String contents;
        FeatureVector features;
        PredictionDistribution predDist;
        Prediction best;

        // classify and if necessary train each listed file
        for (i = 0; i < numFiles; i++) {
            // read contents of file (relative to specified directory if any)
            reader = IOUtils.openReader(
                new File(directory, filenames[i] + fileExtension), charset);

            try {
                // create feature vector by tokenizing contents
                contents = IOUtils.readToString(reader);
                features = new DefaultFeatureVector();
                features.addAllTokens(contents, tokenizer);

                // delegate to classifier
                predDist =
                    classifier.trainOnError(features, classes[i], classSet);
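                // (a null distribution means the prediction was correct;
                // otherwise the classifier was trained on the error and the
                // returned distribution holds the wrong prediction)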

                // log + store results
                outMap = new FieldMap();
                outMap.put(KEY_FILE, filenames[i]);
                outMap.put(KEY_CLASS, classes[i]);

                if (predDist == null) {
                    Util.LOG.debug("Processed " + filenames[i] + fileExtension
                            + ": classification as " + classes[i]
                            + " was correct");
                    outMap.put(KEY_CLASSIFICATION, CORRECT_CLASS);
                } else {
                    best = predDist.best();
                    Util.LOG.debug("Processed " + filenames[i] + fileExtension
                            + ": misclassified as " + best.getType()
                            + " instead of " + classes[i]);
                    outMap.put(KEY_CLASSIFICATION, best.getType());
                }
                result.add(outMap);
            } finally {
                IOUtils.tryToClose(reader);
            }
        } // for

        Util.LOG.debug("Finished classifying and training using "
                + classifier + " and " + tokenizer);
        return result;
    }

    /***
     * Delegates to {@link #classifyAndTrain(FieldContainer, File, String)}.
     *
     * @param reader the {@link FieldContainer} of files to classify is read
     * from this reader; not closed by this method
     * @param writer the resulting {@link FieldContainer} containing
     * classification results is serialized to this writer; not closed by
     * this method
     * @param context a map of objects that are made available for processing;
     * the {@link IOUtils#KEY_LOCAL_CHARSET} entry determines the character
     * set of the listed files; the {@link TextProcessor#KEY_DIRECTORY}
     * {@link File}, if given, is the directory that relative file names are
     * resolved against (otherwise the current working directory is used)
     * @throws IOException if an I/O error occurs
     * @throws ProcessingException if an error occurs during processing
     */
    protected void doProcess(final Reader reader, final Writer writer,
                             final ContextMap context)
            throws IOException, ProcessingException {
        // read input + determine charset + directory (if any)
        final FieldContainer filesToClassify =
            FieldContainer.createFieldContainer(reader);
        final String charset = (String) context.get(IOUtils.KEY_LOCAL_CHARSET);
        final File directory = (File) context.get(KEY_DIRECTORY);

        // delegate to classifyAndTrain
        final FieldContainer result =
            classifyAndTrain(filesToClassify, directory, charset);

        // serialize results
        result.store(writer);
    }

}