1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.classify;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.io.OutputStreamWriter;
28 import java.util.HashMap;
29 import java.util.Iterator;
30 import java.util.Map;
31 import java.util.Set;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34
35 import org.apache.commons.configuration.Configuration;
36 import org.apache.commons.lang.builder.ToStringBuilder;
37
38 import de.fu_berlin.ties.ContextMap;
39 import de.fu_berlin.ties.ProcessingException;
40 import de.fu_berlin.ties.TiesConfiguration;
41 import de.fu_berlin.ties.classify.feature.FeatureTransformer;
42 import de.fu_berlin.ties.classify.feature.FeatureVector;
43 import de.fu_berlin.ties.io.IOUtils;
44 import de.fu_berlin.ties.io.ObjectElement;
45 import de.fu_berlin.ties.text.TextUtils;
46 import de.fu_berlin.ties.util.CollUtils;
47 import de.fu_berlin.ties.util.Util;
48
49 /***
50 * A proxy to the MoonFilter classifier written by Christian Siefkes.
51 * The <code>moonrunner.lua</code> must be installed and in the path.
52 * Note that using this classifier with
53 * {@link de.fu_berlin.ties.classify.feature.FeatureTransformer}s is probably a
54 * bad idea since OSB is already integrated into MoonFilter.
55 *
56 * <p>Any configured "classifier.moon.param.NAME = VALUE" pairs will be passed
57 * to the MoonFilter for configuring itself (e.g., you can specify
58 * "classifier.moon.param.buckets = NUMBER" to modify the number of buckets
59 * in the database files).
60 *
61 * <p>Instances of this class are thread-safe. For efficient use with
62 * {@link de.fu_berlin.ties.classify.MultiBinaryClassifier} and
63 * {@link de.fu_berlin.ties.classify.OneAgainstTheRestClassifier}, the used
64 * <code>moonrunner.lua</code> is a static singleton that is shared by all
65 * instances.
66 *
67 * @author Christian Siefkes
68 * @version $Revision: 1.14 $, $Date: 2006/12/02 15:33:54 $, $Author: siefkes $
69 */
70 public class MoonClassifier extends TrainableClassifier {
71
72 /***
73 * Configuration key: the directory to run the classifier in (optional,
74 * defaults to current working directory).
75 */
76 public static final String CONFIG_DIR = "classifier.moon.directory";
77
78 /***
79 * MoonFilter key: whether classified instances should be reinforced.
80 */
81 private static final String KEY_REINFORCE = "reinforce";
82
83 /***
84 * The wrapper MoonFilter instance. Will be initialized on demand.
85 */
86 private static Process moonClassifier = null;
87
88 /***
89 * Used to guard synchronization.
90 */
91 private static Object guard = new Object();
92
93 /***
94 * Used to feed commands to the classifier.
95 */
96 private static OutputStreamWriter feedToFilter = null;
97
98 /***
99 * Used to read the results returned by the classifier.
100 */
101 private static InputStream readFromFilter = null;
102
103 /***
104 * Counts the number of instances created.
105 */
106 private static int instanceCounter = 0;
107
108 /***
109 * The flat list of currently active classes.
110 */
111 private static Set currentClasses = null;
112
113 /***
114 * The {@link #instanceSuffix} used for the {@link #currentClasses set of
115 * currently active classes}.
116 */
117 private static String currentInstanceSuffix = "";
118
119 /***
120 * The currently active feature vector.
121 */
122 private static FeatureVector currentFeatures = null;
123
124 /***
125 * Regular expression matcher for "COMMAND ok" responses.
126 */
127 private static final Matcher OK_MATCHER =
128 Pattern.compile("^//S+//s+ok").matcher("");
129
130 /***
131 * Regex matcher for simple name=value pairs.
132 */
133 private static final Matcher NAME_VALUE_MATCHER =
134 Pattern.compile("([^//s=]+)=([^//s=]+)").matcher("");
135
136 /***
137 * The directory to run the classifier in (if null, the current working
138 * directory is used).
139 */
140 private final File workDir;
141
142 /***
143 * Count the number of misclassifications.
144 */
145 private long misclassifications = 0;
146
147 /***
148 * Count the number of reinforcement trainings.
149 */
150 private long reinforcements = 0;
151
152 /***
153 * Used to disambiguate between classes in different instances, if
154 * necessary (may be empty but not <code>null</code>).
155 */
156 private final String instanceSuffix;
157
158 /***
159 * Byte buffer for reading the answers returned by the wrapped MoonFilter
160 * (we assume they'll all fit into 64K).
161 */
162 private final byte[] answerBuffer = new byte[64 * 1024];
163
164 /***
165 * Creates a new instance based on the
166 * {@linkplain TiesConfiguration#CONF standard configuration}.
167 *
168 * @param allValidClasses the set of all valid classes
169 * @throws ProcessingException if an I/O error occurs during initialization
170 */
171 public MoonClassifier(final Set<String> allValidClasses)
172 throws ProcessingException {
173 this(allValidClasses, TiesConfiguration.CONF);
174 }
175
176 /***
177 * Creates a new instance based on the provided configuration.
178 *
179 * @param allValidClasses the set of all valid classes
180 * @param config contains configuration properties
181 * @throws ProcessingException if an I/O error occurs during initialization
182 */
183 public MoonClassifier(final Set<String> allValidClasses,
184 final TiesConfiguration config) throws ProcessingException {
185 this(allValidClasses, FeatureTransformer.createTransformer(config),
186 config);
187 }
188
189 /***
190 * Creates a new instance based on the provided arguments.
191 *
192 * @param allValidClasses the set of all valid classes
193 * @param trans the last transformer in the transformer chain to use, or
194 * <code>null</code> if no feature transformers should be used
195 * @param config contains configuration properties
196 * @throws ProcessingException if an I/O error occurs during initialization
197 */
198 public MoonClassifier(final Set<String> allValidClasses,
199 final FeatureTransformer trans, final TiesConfiguration config)
200 throws ProcessingException {
201 super(allValidClasses, trans, config);
202
203 if (trans != null) {
204 Util.LOG.warn("Using MoonClassifier with feature transformers is"
205 + " probably a bad idea since OSB is already part of "
206 + "moonfilter.lua: " + trans);
207 }
208
209
210 if (config.containsKey(CONFIG_DIR)) {
211 workDir = new File(config.getString(CONFIG_DIR));
212 } else {
213 workDir = null;
214 }
215
216
217 synchronized (guard) {
218
219 if (instanceCounter == 0) {
220
221 instanceSuffix = "";
222
223 initialize();
224 } else {
225
226 instanceSuffix = "-" + instanceCounter;
227 }
228 instanceCounter++;
229
230
231 adjustClasses(allValidClasses);
232 try {
233 executeCommand("create", "");
234 } catch (ProcessingException pe) {
235
236 Util.LOG.info("Class databases for "
237 + allValidClasses.toString() + " seem to exist already");
238 }
239 }
240 }
241
242
243 /***
244 * Adjust a feature vector to use for training or classification. The
245 * feature vector will only be transferred to the MoonFilter if necessary
246 * (the same feature vector will be re-used for future operations).
247 *
248 * @param features the feature vector to use
249 * @throws ProcessingException if an error occurs while communicating with
250 * the classifier.
251 */
252 private void adjustFeatures(final FeatureVector features)
253 throws ProcessingException {
254
255 if (features != currentFeatures) {
256
257
258 final StringBuilder builder = new StringBuilder("readuntil");
259 builder.append(TextUtils.LINE_SEPARATOR);
260 builder.append(features.flatten());
261 executeCommand(builder.toString(), TextUtils.LINE_SEPARATOR);
262 currentFeatures = features;
263 }
264 }
265
266 /***
267 * Ensure that classes are set correctly. This method should always be
268 * executed in a synchronized context (as it doesn't synchronize itself).
269 *
270 * @param classes the set of class names (Strings) to use
271 * @throws ProcessingException if an error occurs while communicating with
272 * the classifier.
273 */
274 private void adjustClasses(final Set classes)
275 throws ProcessingException {
276
277
278 if (!classes.equals(currentClasses)
279 || !instanceSuffix.equals(currentInstanceSuffix)) {
280
281 final String flatClassList = CollUtils.flatten(classes.iterator(),
282 instanceSuffix + " ") + instanceSuffix;
283
284
285 executeCommand("classes", flatClassList);
286 currentClasses = classes;
287 currentInstanceSuffix = instanceSuffix;
288 }
289 }
290
291 /***
292 * Classifies an item that is represented by a feature vector by choosing
293 * the most probable class among a set of candidate classes.
294 *
295 * @param features the feature vector to consider
296 * @param candidateClasses an array of the classes that are allowed for
297 * this item
298 * @param context used to store context information to be used for training
299 * @return the result of the classification; you can call
300 * {@link PredictionDistribution#best()} to get the most probably class;
301 * this classifier returns only the best prediction, so
302 * {@link PredictionDistribution#size()} will be 1
303 * @throws ProcessingException if an I/O error occurs during communication
304 * with the external program
305 */
306 protected PredictionDistribution doClassify(final FeatureVector features,
307 final Set candidateClasses, final ContextMap context)
308 throws ProcessingException {
309 final Map<String, String> result;
310
311 synchronized (guard) {
312
313 adjustClasses(candidateClasses);
314 adjustFeatures(features);
315
316
317 result = executeCommand("classify", "");
318 }
319
320
321
322
323 final PredictionDistribution predDist = new PredictionDistribution();
324 final String winnerClass = stripInstanceSuffix(result.get("class"));
325
326 if (candidateClasses.size() == 2) {
327 final double pR = Util.asDouble(result.get("pR"));
328
329
330 final double winnerProb = 1.0 / (1.0 + Math.exp(-Math.sqrt(
331 Math.abs(pR))));
332 final double loserProb = 1.0 - winnerProb;
333
334 predDist.add(new Prediction(winnerClass,
335 new Probability(winnerProb)));
336
337
338 String loserClass;
339 final Iterator candidateIter = candidateClasses.iterator();
340 do {
341 loserClass = (String) candidateIter.next();
342 } while (loserClass.equals(winnerClass));
343
344 predDist.add(new Prediction(loserClass,
345 new Probability(loserProb)));
346
347 } else {
348 predDist.add(new Prediction(winnerClass,
349 new Probability(Util.asDouble(result.get("prob")))));
350 }
351
352
353
354 context.put(KEY_REINFORCE, result.get(KEY_REINFORCE));
355 return predDist;
356 }
357
358 /***
359 * {@inheritDoc} <strong>Currently this method is not implemented since
360 * Moonfilter needs to know the list of active classes.</strong>
361 */
362 protected void doTrain(final FeatureVector features,
363 final String targetClass, final ContextMap context)
364 throws ProcessingException {
365
366
367 throw new UnsupportedOperationException("Moonfilter needs to know the "
368 + "list of active classes -- call trainOnError instead of train");
369 }
370
371 /***
372 * Helper method than sends a command to the MoonFilter and returns the
373 * result as a hash map. The current implementatation only parses simple
374 * "name=value" pairs correctly (no nested or quoted values) and does not
375 * handle multi-line responses (with backslash-escaped line ends).
376 * This method should always be executed in a synchronized context (as it
377 * doesn't synchronize itself).
378 *
379 * @param command the command to execute
380 * @param params a string containing the parameters for the command, if any
381 * -- may be empty but not <code>null</code>
382 * @return the results (if any) of the command as a map of strings
383 * @throws ProcessingException thrown if the MoonFilter returns "fail"
384 * or does not respond or an I/O error occurs during communication
385 */
386 private Map<String, String> executeCommand(final String command,
387 final String params) throws ProcessingException {
388 final Map<String, String> result = new HashMap<String, String>();
389 try {
390
391 IOUtils.writeLine(command + " " + params, feedToFilter);
392 feedToFilter.flush();
393 final int bytesRead =
394 IOUtils.readUntilLineEnd(readFromFilter, answerBuffer);
395 final String answer = new String(answerBuffer, 0, bytesRead);
396
397 if (OK_MATCHER.reset(answer).find()) {
398
399 NAME_VALUE_MATCHER.reset(answer);
400 while (NAME_VALUE_MATCHER.find()) {
401 result.put(NAME_VALUE_MATCHER.group(1),
402 NAME_VALUE_MATCHER.group(2));
403 }
404 } else {
405
406 throw new ProcessingException("Command '" + command + " "
407 + params + "' failed: " + answer);
408 }
409 } catch (IOException ioe) {
410
411 throw new ProcessingException("I/O error while executing command",
412 ioe);
413 }
414 return result;
415 }
416
417 /***
418 * Initializes the classifier. This method should always be executed in a
419 * synchronized context (as it doesn't synchronize itself).
420 *
421 * @throws ProcessingException if an I/O error occurs during initialization
422 */
423 private void initialize() throws ProcessingException {
424
425 try {
426 final ProcessBuilder pb = new ProcessBuilder("moonrunner.lua");
427 pb.redirectErrorStream(true);
428 if (workDir != null) {
429 pb.directory(workDir);
430 }
431 moonClassifier = pb.start();
432 feedToFilter =
433 new OutputStreamWriter(moonClassifier.getOutputStream());
434 readFromFilter = moonClassifier.getInputStream();
435 } catch (IOException ioe) {
436
437 throw new ProcessingException(
438 "I/O error while initializing classifier", ioe);
439 }
440
441
442 final Configuration paramConfig =
443 getConfig().subset("classifier.moon.param");
444 final Iterator keyIter = paramConfig.getKeys();
445 String key, value;
446
447 while (keyIter.hasNext()) {
448 key = (String) keyIter.next();
449 value = paramConfig.getString(key);
450 Util.LOG.debug("Configuring MoonClassifier: " + key + " = "
451 + value);
452 executeCommand(key, value);
453 }
454 }
455
456 /***
457 * {@inheritDoc}
458 */
459 public void reset() throws ProcessingException {
460 synchronized (guard) {
461 if (moonClassifier != null) {
462
463 adjustClasses(getAllClasses());
464 executeCommand("destroy", "");
465 executeCommand("create", "");
466 Util.LOG.info("Reset class databases for "
467 + getAllClasses().toString());
468 } else {
469 Util.LOG.debug(
470 "Nothing to reset: wrapped MoonFilter instance is null");
471 }
472 }
473 }
474
475 /***
476 * Checks whether training is necessary, either due to a misclassification
477 * or for reinforcement.
478 *
479 * @param targetClass the expected class of this feature vector; must be
480 * contained in the set of <code>candidateClasses</code>
481 * @param predDist the prediction distribution returned by
482 * {@link #doClassify(FeatureVector, Set, ContextMap)}
483 * @param context used to transport the reinforcement status returned by
484 * the MoonFilter
485 * @return whether to train this instance
486 */
487 private boolean isTrainingNecessary(final String targetClass,
488 final PredictionDistribution predDist, final ContextMap context) {
489
490
491 final boolean result;
492
493 if (shouldTrain(targetClass, predDist, context)) {
494 result = true;
495
496 } else {
497 final String reinforceParam = (String) context.get(KEY_REINFORCE);
498
499 if ("true".equals(reinforceParam)) {
500 result = true;
501
502 } else if ("false".equals(reinforceParam)) {
503 result = false;
504 } else {
505
506 throw new IllegalArgumentException(
507 "Invalid reinforcement parameter: " + reinforceParam);
508 }
509 }
510 return result;
511 }
512
513 /***
514 * Strips the {@link #instanceSuffix} (if any) from a string, ensuring that
515 * the string does actually end in the suffix.
516 *
517 * @param str the string to convert
518 * @return the string without the instance suffix
519 * @throws IllegalArgumentException if the string does not end in the
520 * instance suffix
521 */
522 private String stripInstanceSuffix(final String str)
523 throws IllegalArgumentException {
524 if (instanceSuffix.length() == 0) {
525
526 return str;
527 } else {
528 final int difference = str.length() - instanceSuffix.length();
529
530 if (difference <= 0
531 || !str.substring(difference).equals(instanceSuffix)) {
532 throw new IllegalArgumentException("'" + str
533 + "' does not end in instance suffix '"
534 + instanceSuffix + "'");
535 }
536
537
538 return str.substring(0, difference);
539 }
540 }
541
542 /***
543 * {@inheritDoc} Currently, this classifier does not support XML
544 * serialization, throwing an {@link UnsupportedOperationException} instead.
545 *
546 * @throws UnsupportedOperationException always thrown by this
547 * implementation
548 */
549 public ObjectElement toElement() throws UnsupportedOperationException {
550 throw new UnsupportedOperationException(
551 "XML serialization is not supported by MoonClassifier");
552 }
553
554 /***
555 * {@inheritDoc}
556 */
557 protected boolean trainOnErrorHook(final PredictionDistribution predDist,
558 final FeatureVector features, final String targetClass,
559 final Set candidateClasses, final ContextMap context)
560 throws ProcessingException {
561 if (isTrainingNecessary(targetClass, predDist, context)) {
562 final Map<String, String> result;
563 final String fullTargetClass = targetClass + instanceSuffix;
564
565 synchronized (guard) {
566
567 adjustClasses(candidateClasses);
568 adjustFeatures(features);
569
570
571 result = executeCommand("train", fullTargetClass);
572 }
573
574
575 if (Util.asBoolean(result.get("misclassified"))) {
576 misclassifications++;
577 }
578 if (Util.asBoolean(result.get("reinforced"))) {
579 reinforcements++;
580 }
581
582 }
583
584
585 return true;
586 }
587
588 /***
589 * Returns a string representation of this object.
590 *
591 * @return a textual representation
592 */
593 public String toString() {
594 final ToStringBuilder builder = new ToStringBuilder(this)
595 .appendSuper(super.toString())
596 .append("instance suffix", instanceSuffix)
597 .append("active classes", currentClasses)
598 .append("misclassifications", misclassifications)
599 .append("reinforcements", reinforcements);
600 return builder.toString();
601 }
602
603 }