1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.classify;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.util.List;
27
28 import org.apache.commons.lang.builder.ToStringBuilder;
29
30 import de.fu_berlin.ties.CollectingProcessor;
31 import de.fu_berlin.ties.ContextMap;
32 import de.fu_berlin.ties.ProcessingException;
33 import de.fu_berlin.ties.TiesConfiguration;
34 import de.fu_berlin.ties.classify.feature.FeatureExtractor;
35 import de.fu_berlin.ties.classify.feature.FeatureExtractorFactory;
36 import de.fu_berlin.ties.classify.feature.FeatureVector;
37 import de.fu_berlin.ties.io.IOUtils;
38 import de.fu_berlin.ties.util.CollUtils;
39
40 /***
41 * A text filter provides a simple API for classifying text files. All instances
42 * of this class share a common
43 * {@link de.fu_berlin.ties.classify.TrainableClassifier} which will be
44 * initialized by the first created instance. This class it meant to be
45 * used with <a href="http://www.martiansoftware.com/nailgun/">NailGun</a> to
46 * avoid the cost of creating the virtual machine and to allow re-using the
47 * same classifier instance between multiple calls.
48 *
49 * <p>The classifier is configured from the provided
50 * configuration, using the
51 * {@link de.fu_berlin.ties.classify.ClassTrain#CONFIG_SUFFIX_TEXT} suffix
52 * to allow text-specific settings. The classes to consider for classification
53 * are read from the {@link TextFilter#CONFIG_CLASSES} parameter.
54 * The probability of the very first class will be returned as "score".
55 *
56 * <p>This class is meant to be invoked on the command line as "filter" goal to
57 * classify or train a text file. It supports two commands, "classify" and
58 * "train":
59 * <dl>
60 * <dt>classify FILENAME<dt>
61 * <dd>Classifies the given file, writing a single line of output to
62 * {@link System#out}:<br>
63 * <tt>class=PREDICTED-CLASS score=PROB.-OF-FIRST-CLASS
64 * prob==PROB.-OF-PREDICTED-CLASS</tt></dd>
65 * <dt>train TRUE-CLASS FILENAME<dt>
66 * <dd>Trains the given file as TRUE-CLASS (must be one of the configured
67 * classes).</dd>
68 * </dl>
69 *
70 * <p>Additional parameters after the required arguments are allowed but
71 * ignored; other commands will be treated as errors.
72 *
73 * @author Christian Siefkes
74 * @version $Revision: 1.7 $, $Date: 2006/10/21 16:03:55 $, $Author: siefkes $
75 */
76 public class TextFilter extends CollectingProcessor {
77
78 /***
79 * Configuration key: Names of the classes used to filter text.
80 * The probability of the very first class will be returned as "score".
81 */
82 public static final String CONFIG_CLASSES = "textfilter.classes";
83
84 /***
85 * Used to guard access to the classifier.
86 */
87 private static final Object GUARD = new Object();
88
89 /***
90 * Classifier shared by all instances of this object.
91 */
92 private static TrainableClassifier classifier = null;
93
94 /***
95 * Used to convert text sequences into feature vectors.
96 */
97 private static FeatureExtractor featureExt;
98
99 /***
100 * The class whose score (probability) should be output.
101 */
102 private static String scoreClass = null;
103
104 /***
105 * Used to cache the last generated feature vector between successive calls.
106 */
107 private static FeatureVector cachedFeatures = null;
108
109 /***
110 * Used to cache the last generated feature vector between successive calls.
111 */
112 private static String cachedAbsPath = "";
113
114 /***
115 * Used to cache the last generated feature vector between successive calls.
116 */
117 private static long cachedLastModified = -1;
118
119 /***
120 * Used to cache the last generated feature vector between successive calls.
121 */
122 private static long cachedLength = -1;
123
124
125 /***
126 * Creates a new instance, configured from the
127 * {@linkplain TiesConfiguration#CONF default configuration}.
128 *
129 * @throws ProcessingException if the configured classifier instance cannot
130 * be instantiated
131 */
132 public TextFilter() throws ProcessingException {
133 this(TiesConfiguration.CONF);
134 }
135
136 /***
137 * Creates a new instance.
138 *
139 * @param conf used to configure this instance
140 * @throws ProcessingException if the configured classifier instance cannot
141 * be instantiated
142 */
143 public TextFilter(final TiesConfiguration conf) throws ProcessingException {
144 super(conf);
145
146
147 synchronized (GUARD) {
148 if (scoreClass == null) {
149 final String[] classNames = conf.getStringArray(CONFIG_CLASSES);
150
151 if (classNames.length < 2) {
152 throw new IllegalArgumentException(
153 "Not enough classes to filter text: "
154 + classNames.length);
155 }
156
157 scoreClass = classNames[0];
158 featureExt = FeatureExtractorFactory.createExtractor(
159 conf, Classifier.CONFIG_CLASSIFIER);
160 classifier = TrainableClassifier.createClassifier(
161 CollUtils.arrayAsSet(classNames), conf,
162 ClassTrain.CONFIG_SUFFIX_TEXT);
163 }
164 }
165 }
166
167 /***
168 * Helper method for extracting features from a text file. Caches and
169 * re-uses features from the last call to this method if possible. This
170 * method is meant to be evoked in a synchronized context, if necessary;
171 * it is not synchronized by itself.
172 *
173 * @param file the file to process
174 * @return a feature vector representing the file contents
175 * @throws IOException if an I/O error occurs
176 * @throws ProcessingException if an error occurs while processing the text
177 */
178 private FeatureVector buildFeatures(final File file)
179 throws IOException, ProcessingException {
180 final FeatureVector result;
181 final String absolutePath = file.getAbsolutePath();
182 final long lastModified = file.lastModified();
183 final long length = file.length();
184
185
186 if (absolutePath.equals(cachedAbsPath)
187 && (cachedLastModified == lastModified)
188 && (cachedLength == length)) {
189
190 result = cachedFeatures;
191 } else {
192 result = featureExt.buildFeatures(
193 IOUtils.openReader(file, getConfig()));
194
195
196 cachedFeatures = result;
197 cachedAbsPath = absolutePath;
198 cachedLastModified = lastModified;
199 cachedLength = length;
200 }
201
202 return result;
203 }
204
205 /***
206 * {@inheritDoc}
207 */
208 public void process(final List<String> collected, final ContextMap context)
209 throws IOException, ProcessingException {
210
211 if (collected.isEmpty()) {
212 throw new IllegalArgumentException("Missing command");
213 }
214 final String command = collected.get(0);
215 final String filename;
216 final FeatureVector features;
217
218 if ("classify".equals(command)) {
219
220 if (collected.size() < 2) {
221 throw new IllegalArgumentException(
222 "Not enough arguments for classify command");
223 }
224 filename = collected.get(1);
225
226 synchronized (GUARD) {
227 features = buildFeatures(new File(filename));
228 final PredictionDistribution predDist =
229 classifier.classify(features, classifier.getAllClasses());
230
231
232 final Prediction bestPred = predDist.best();
233 final String predictedClass = bestPred.getType();
234 final double bestProb = bestPred.getProbability().getProb();
235
236
237 final Prediction scorePred =
238 predDist.getPredictionForType(scoreClass);
239 final double scoreProb = scorePred.getProbability().getProb();
240
241
242
243 final StringBuilder output = new StringBuilder("class=")
244 .append(predictedClass)
245 .append(" score=")
246 .append(scoreProb)
247 .append(" prob=")
248 .append(bestProb);
249 System.out.println(output.toString());
250 }
251 } else if ("train".equals(command)) {
252
253 if (collected.size() < 3) {
254 throw new IllegalArgumentException(
255 "Not enough arguments for train command");
256 }
257 final String trueClass = collected.get(1);
258 filename = collected.get(2);
259
260 synchronized (GUARD) {
261 features = buildFeatures(new File(filename));
262 classifier.trainOnError(features, trueClass,
263 classifier.getAllClasses());
264 }
265 } else {
266 throw new IllegalArgumentException("Unknown command: " + command);
267 }
268 }
269
270 /***
271 * Returns a string representation of this object.
272 *
273 * @return a textual representation
274 */
275 public String toString() {
276 return new ToStringBuilder(this)
277 .append("classifier", classifier)
278 .toString();
279 }
280
281 }