1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.classify;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.Reader;
27 import java.io.Writer;
28 import java.util.HashSet;
29 import java.util.Iterator;
30 import java.util.Set;
31
32 import de.fu_berlin.ties.ContextMap;
33 import de.fu_berlin.ties.ProcessingException;
34 import de.fu_berlin.ties.TextProcessor;
35 import de.fu_berlin.ties.TiesConfiguration;
36 import de.fu_berlin.ties.classify.feature.DefaultFeatureVector;
37 import de.fu_berlin.ties.classify.feature.FeatureVector;
38 import de.fu_berlin.ties.io.FieldContainer;
39 import de.fu_berlin.ties.io.FieldMap;
40 import de.fu_berlin.ties.io.IOUtils;
41 import de.fu_berlin.ties.text.TextTokenizer;
42 import de.fu_berlin.ties.text.TokenizerFactory;
43 import de.fu_berlin.ties.util.Util;
44
45 /***
46 * Classifies a list of files, training the classifier on each error. See
47 * {@link #classifyAndTrain(FieldContainer, File, String)} for a description of
48 * input and output formats.
49 *
50 * <p>This class does not calculate statistics; you can do so be calling e.g.
51 * <code>tail -q --lines 500 <em>FILENAME</em>|grep -v "|+"|wc</code> on the
52 * output serialized in {@link de.fu_berlin.ties.io.DelimSepValues} format to
53 * get the number of errors during the last 500 classifications (assuming that
54 * classes to not start with a "+").
55 *
56 * <p>Instances of this class are thread-safe.
57 *
58 * @author Christian Siefkes
59 * @version $Revision: 1.13 $, $Date: 2004/11/17 09:15:10 $, $Author: siefkes $
60 */
61 public class ClassTrain extends TextProcessor {
62
63 /***
64 * Configuration key: The extension to append to file names given via the
65 * {@linkplain #KEY_FILE File key} (if any).
66 */
67 public static final String CONFIG_FILE_EXT = "file.ext";
68
69 /***
70 * Serialization key for the name of the file to classify.
71 */
72 public static final String KEY_FILE = "File";
73
74 /***
75 * Serialization key for the correct class.
76 */
77 public static final String KEY_CLASS = "Class";
78
79 /***
80 * Serialization key for the result of the classification: either
81 * {@link #CORRECT_CLASS} if the correct class was predicted or the
82 * wrongly predicted class in case of an error.
83 */
84 public static final String KEY_CLASSIFICATION = "Classification";
85
86 /***
87 * Value of the {@link #KEY_CLASSIFICATION} field for correct predictions:
88 * {@value}.
89 */
90 public static final String CORRECT_CLASS = "+";
91
92 /***
93 * Used to create tokenizers.
94 */
95 private final TokenizerFactory tFactory;
96
97 /***
98 * The extension to append to file names given via the
99 * {@linkplain #KEY_FILE File key}; empty string if none.
100 */
101 private final String fileExtension;
102
103 /***
104 * Creates a new instance using a default extension and the
105 * {@link TiesConfiguration#CONF standard configuration}.
106 */
107 public ClassTrain() {
108 this("cls");
109 }
110
111 /***
112 * Creates a new instance using the
113 * {@link TiesConfiguration#CONF standard configuration}.
114 *
115 * @param outExt the extension to use for output files
116 */
117 public ClassTrain(final String outExt) {
118 this(outExt, TiesConfiguration.CONF);
119 }
120
121 /***
122 * Creates a new instance from the provided configuration.
123 *
124 * @param outExt the extension to use for output files
125 * @param conf used to configure this instance; if <code>null</code>,
126 * the {@linkplain TiesConfiguration#CONF standard configuration} is used
127 */
128 public ClassTrain(final String outExt, final TiesConfiguration conf) {
129 this(outExt, conf,
130 new TokenizerFactory(conf, Classifier.CONFIG_CLASSIFIER),
131 conf.getString(CONFIG_FILE_EXT, ""));
132 }
133
134 /***
135 * Creates a new instance.
136 *
137 * @param outExt the extension to use for output files
138 * @param conf used to configure this instance; if <code>null</code>,
139 * the {@linkplain TiesConfiguration#CONF standard configuration} is used
140 * @param factory used to create tokenizers
141 * @param fileExt the extension to append to file names given via the
142 * {@linkplain #KEY_FILE File key}; <code>null</code> or the empty string
143 * if none should be appended
144 */
145 public ClassTrain(final String outExt, final TiesConfiguration conf,
146 final TokenizerFactory factory, final String fileExt) {
147 super(outExt, conf);
148 tFactory = factory;
149 fileExtension = (fileExt != null) ? fileExt : "";
150 }
151
152 /***
153 * Classifies a list of files, training the classifier on each error.
154 *
155 * @param filesToClassify a field container of the files to process; each
156 * entry must contain a {@link #KEY_FILE} field giving the name of the file
157 * to classify and {@link #KEY_CLASS} giving the true class of the file
158 * @param directory file names are relative to this directory; if
159 * <code>null</code> they are relative to the working directory
160 * @param charset the character set of the files to process
161 * @return a field container of the classification results; in addition to
162 * the fields given above, each entry will contain the classification result
163 * in a {@link #KEY_CLASSIFICATION} field: {@link #CORRECT_CLASS} in
164 * case of a correct classification, the name of the wrongly predicted
165 * class otherwise
166 * @throws IOException if an I/O error occurs
167 * @throws ProcessingException if an error occurs during processing
168 */
169 public FieldContainer classifyAndTrain(final FieldContainer filesToClassify,
170 final File directory, final String charset)
171 throws IOException, ProcessingException {
172 final FieldContainer result = FieldContainer.createFieldContainer();
173 final int numFiles = filesToClassify.size();
174 FieldMap inMap;
175 FieldMap outMap;
176 String currentClass;
177 String[] filenames = new String[numFiles];
178 String[] classes = new String[numFiles];
179 final Iterator fileIter = filesToClassify.entryIterator();
180 final Set<String> classSet = new HashSet<String>();
181 int i = 0;
182
183
184 while (fileIter.hasNext()) {
185 inMap = (FieldMap) fileIter.next();
186 filenames[i] = (String) inMap.get(KEY_FILE);
187 currentClass = (String) inMap.get(KEY_CLASS);
188 classSet.add(currentClass);
189 classes[i] = currentClass;
190 i++;
191 }
192
193
194 final TrainableClassifier classifier =
195 TrainableClassifier.createClassifier(classSet, getConfig());
196 final TextTokenizer tokenizer = tFactory.createTokenizer("");
197
198
199 classifier.reset();
200
201 Reader reader;
202 String contents;
203 FeatureVector features;
204 PredictionDistribution predDist;
205 Prediction best;
206
207
208 for (i = 0; i < numFiles; i++) {
209
210 reader = IOUtils.openReader(
211 new File(directory, filenames[i] + fileExtension), charset);
212
213 try {
214
215 contents = IOUtils.readToString(reader);
216 features = new DefaultFeatureVector();
217 features.addAllTokens(contents, tokenizer);
218
219
220 predDist =
221 classifier.trainOnError(features, classes[i], classSet);
222
223
224 outMap = new FieldMap();
225 outMap.put(KEY_FILE, filenames[i]);
226 outMap.put(KEY_CLASS, classes[i]);
227
228 if (predDist == null) {
229 Util.LOG.debug("Processed " + filenames[i] + fileExtension
230 + ": classification as " + classes[i]
231 + " was correct");
232 outMap.put(KEY_CLASSIFICATION, CORRECT_CLASS);
233 } else {
234 best = predDist.best();
235 Util.LOG.debug("Processed " + filenames[i] + fileExtension
236 + ": misclassified as " + best.getType()
237 + " instead of " + classes[i]);
238 outMap.put(KEY_CLASSIFICATION, best.getType());
239 }
240 result.add(outMap);
241 } finally {
242 IOUtils.tryToClose(reader);
243 }
244 }
245
246 Util.LOG.debug("Finished classifying and training using "
247 + classifier + " and " + tokenizer);
248 return result;
249 }
250
251 /***
252 * Delegates to {@link #classifyAndTrain(FieldContainer, File, String)}.
253 *
254 * @param reader the {@link FieldContainer} of files to classify is read
255 * from this reader; not closed by this method
256 * @param writer the resulting {@link FieldContainer} containing
257 * classification results is serialized to this writer; not closed by
258 * this method
259 * @param context a map of objects that are made available for processing;
260 * the {@link IOUtils#KEY_LOCAL_CHARSET} is used to determine the character
261 * set of the listed files; the {@link TextProcessor#KEY_DIRECTORY}
262 * {@link File} determines the source of relative file names, if given
263 * (otherwise the current working directory is used)
264 * @throws IOException if an I/O error occurs
265 * @throws ProcessingException if an error occurs during processing
266 */
267 protected void doProcess(final Reader reader, final Writer writer,
268 final ContextMap context)
269 throws IOException, ProcessingException {
270
271 final FieldContainer filesToClassify =
272 FieldContainer.createFieldContainer(reader);
273 final String charset = (String) context.get(IOUtils.KEY_LOCAL_CHARSET);
274 final File directory = (File) context.get(KEY_DIRECTORY);
275
276
277 final FieldContainer result =
278 classifyAndTrain(filesToClassify, directory, charset);
279
280
281 result.store(writer);
282 }
283
284 }