View Javadoc

1   /*
2    * Copyright (C) 2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.classify;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.InputStream;
27  import java.io.OutputStreamWriter;
28  import java.util.HashMap;
29  import java.util.Iterator;
30  import java.util.Map;
31  import java.util.Set;
32  import java.util.regex.Matcher;
33  import java.util.regex.Pattern;
34  
35  import org.apache.commons.configuration.Configuration;
36  import org.apache.commons.lang.builder.ToStringBuilder;
37  
38  import de.fu_berlin.ties.ContextMap;
39  import de.fu_berlin.ties.ProcessingException;
40  import de.fu_berlin.ties.TiesConfiguration;
41  import de.fu_berlin.ties.classify.feature.FeatureTransformer;
42  import de.fu_berlin.ties.classify.feature.FeatureVector;
43  import de.fu_berlin.ties.io.IOUtils;
44  import de.fu_berlin.ties.io.ObjectElement;
45  import de.fu_berlin.ties.text.TextUtils;
46  import de.fu_berlin.ties.util.CollUtils;
47  import de.fu_berlin.ties.util.Util;
48  
49  /***
50   * A proxy to the MoonFilter classifier written by Christian Siefkes.
51   * The <code>moonrunner.lua</code> must be installed and in the path.
52   * Note that using this classifier with
53   * {@link de.fu_berlin.ties.classify.feature.FeatureTransformer}s is probably a
54   * bad idea since OSB is already integrated into MoonFilter.
55   *
56   * <p>Any configured "classifier.moon.param.NAME = VALUE" pairs will be passed
57   * to the MoonFilter for configuring itself (e.g., you can specify
58   * "classifier.moon.param.buckets = NUMBER" to modify the number of buckets
59   * in the database files).
60   *
61   * <p>Instances of this class are thread-safe. For efficient use with
62   * {@link de.fu_berlin.ties.classify.MultiBinaryClassifier} and
63   * {@link de.fu_berlin.ties.classify.OneAgainstTheRestClassifier}, the used
64   * <code>moonrunner.lua</code> is a static singleton that is shared by all
65   * instances.
66   *
67   * @author Christian Siefkes
68   * @version $Revision: 1.14 $, $Date: 2006/12/02 15:33:54 $, $Author: siefkes $
69   */
70  public class MoonClassifier extends TrainableClassifier {
71  
72      /***
73       * Configuration key: the directory to run the classifier in (optional,
74       * defaults to current working directory).
75       */
76      public static final String CONFIG_DIR = "classifier.moon.directory";
77  
78      /***
79       * MoonFilter key: whether classified instances should be reinforced.
80       */
81      private static final String KEY_REINFORCE = "reinforce";
82  
83      /***
84       * The wrapper MoonFilter instance. Will be initialized on demand.
85       */
86      private static Process moonClassifier = null;
87  
88      /***
89       * Used to guard synchronization.
90       */
91      private static Object guard = new Object();
92  
93      /***
94       * Used to feed commands to the classifier.
95       */
96      private static OutputStreamWriter feedToFilter = null;
97  
98      /***
99       * Used to read the results returned by the classifier.
100      */
101     private static InputStream readFromFilter = null;
102 
103     /***
104      * Counts the number of instances created.
105      */
106     private static int instanceCounter = 0;
107 
108     /***
109      * The flat list of currently active classes.
110      */
111     private static Set currentClasses = null;
112 
113     /***
114      * The {@link #instanceSuffix} used for the {@link #currentClasses set of
115      * currently active classes}.
116      */
117     private static String currentInstanceSuffix = "";
118 
119     /***
120      * The currently active feature vector.
121      */
122     private static FeatureVector currentFeatures = null;
123 
124     /***
125      * Regular expression matcher for "COMMAND ok" responses.
126      */
127     private static final Matcher OK_MATCHER =
128         Pattern.compile("^//S+//s+ok").matcher("");
129 
130     /***
131      * Regex matcher for simple name=value pairs.
132      */
133     private static final Matcher NAME_VALUE_MATCHER =
134         Pattern.compile("([^//s=]+)=([^//s=]+)").matcher("");
135 
136     /***
137      * The directory to run the classifier in (if null, the current working
138      * directory is used).
139      */
140     private final File workDir;
141 
142     /***
143      * Count the number of misclassifications.
144      */
145     private long misclassifications = 0;
146 
147     /***
148      * Count the number of reinforcement trainings.
149      */
150     private long reinforcements = 0;
151 
152     /***
153      * Used to disambiguate between classes in different instances, if
154      * necessary (may be empty but not <code>null</code>).
155      */
156     private final String instanceSuffix;
157 
158     /***
159      * Byte buffer for reading the answers returned by the wrapped MoonFilter
160      * (we assume they'll all fit into 64K).
161      */
162     private final byte[] answerBuffer = new byte[64 * 1024];
163 
164     /***
165      * Creates a new instance based on the
166      * {@linkplain TiesConfiguration#CONF standard configuration}.
167      *
168      * @param allValidClasses the set of all valid classes
169      * @throws ProcessingException if an I/O error occurs during initialization
170      */
171     public MoonClassifier(final Set<String> allValidClasses)
172             throws ProcessingException {
173         this(allValidClasses, TiesConfiguration.CONF);
174     }
175 
176     /***
177      * Creates a new instance based on the provided configuration.
178      *
179      * @param allValidClasses the set of all valid classes
180      * @param config contains configuration properties
181      * @throws ProcessingException if an I/O error occurs during initialization
182      */
183     public MoonClassifier(final Set<String> allValidClasses,
184             final TiesConfiguration config) throws ProcessingException {
185         this(allValidClasses, FeatureTransformer.createTransformer(config),
186                 config);
187     }
188 
189     /***
190      * Creates a new instance based on the provided arguments.
191      *
192      * @param allValidClasses the set of all valid classes
193      * @param trans the last transformer in the transformer chain to use, or
194      * <code>null</code> if no feature transformers should be used
195      * @param config contains configuration properties
196      * @throws ProcessingException if an I/O error occurs during initialization
197      */
198     public MoonClassifier(final Set<String> allValidClasses,
199             final FeatureTransformer trans, final TiesConfiguration config)
200             throws ProcessingException {
201         super(allValidClasses, trans, config);
202 
203         if (trans != null) {
204             Util.LOG.warn("Using MoonClassifier with feature transformers is"
205                     + " probably a bad idea since OSB is already part of "
206                     + "moonfilter.lua: " + trans);
207         }
208 
209         // set directory
210         if (config.containsKey(CONFIG_DIR)) {
211             workDir = new File(config.getString(CONFIG_DIR));
212         } else {
213             workDir = null;
214         }
215 
216         // determine suffix and initialize classifier
217         synchronized (guard) {
218             // determine instance suffix
219             if (instanceCounter == 0) {
220                 // empty suffix for first instance
221                 instanceSuffix = "";
222                 // initialize static process
223                 initialize();
224             } else {
225                 // suffixes "-1", "-2" etc. for further instances
226                 instanceSuffix = "-" + instanceCounter;
227             }
228             instanceCounter++;
229 
230             // create class databases, if they don't exist yet
231             adjustClasses(allValidClasses);
232             try {
233                 executeCommand("create", "");
234             } catch (ProcessingException pe) {
235                 // this is not an error -- just re-use the existing databases
236                 Util.LOG.info("Class databases for "
237                     + allValidClasses.toString() + " seem to exist already");
238             }
239         }
240     }
241 
242 
243     /***
244      * Adjust a feature vector to use for training or classification. The
245      * feature vector will only be transferred to the MoonFilter if necessary
246      * (the same feature vector will be re-used for future operations).
247      *
248      * @param features the feature vector to use
249      * @throws ProcessingException if an error occurs while communicating with
250      * the classifier.
251      */
252     private void adjustFeatures(final FeatureVector features)
253     throws ProcessingException {
254         // identity comparision for efficiency
255         if (features != currentFeatures) {
256             //Util.LOG.debug("Resetting feature vector");
257             // send via 'readuntil' command, terminated by an empty line
258             final StringBuilder builder = new StringBuilder("readuntil");
259             builder.append(TextUtils.LINE_SEPARATOR);
260             builder.append(features.flatten());
261             executeCommand(builder.toString(), TextUtils.LINE_SEPARATOR);
262             currentFeatures = features;
263         }
264     }
265 
266     /***
267      * Ensure that classes are set correctly. This method should always be
268      * executed in a synchronized context (as it doesn't synchronize itself).
269      *
270      * @param classes the set of class names (Strings) to use
271      * @throws ProcessingException if an error occurs while communicating with
272      * the classifier.
273      */
274     private void adjustClasses(final Set classes)
275     throws ProcessingException {
276         // update classes if either the active classes or the instance
277         // suffix have changed
278         if (!classes.equals(currentClasses)
279                 || !instanceSuffix.equals(currentInstanceSuffix)) {
280             // flatten class list, appending instance suffix, if any
281             final String flatClassList = CollUtils.flatten(classes.iterator(),
282                     instanceSuffix + " ") + instanceSuffix;
283             //Util.LOG.debug("Resetting classes to " + flatClassList);
284             // send 'classes' command to classifier + store new classes
285             executeCommand("classes", flatClassList);
286             currentClasses = classes;
287             currentInstanceSuffix = instanceSuffix;
288         }
289     }
290 
291     /***
292      * Classifies an item that is represented by a feature vector by choosing
293      * the most probable class among a set of candidate classes.
294      *
295      * @param features the feature vector to consider
296      * @param candidateClasses an array of the classes that are allowed for
297      * this item
298      * @param context used to store context information to be used for training
299      * @return the result of the classification; you can call
300      * {@link PredictionDistribution#best()} to get the most probably class;
301      * this classifier returns only the best prediction, so
302      * {@link PredictionDistribution#size()} will be 1
303      * @throws ProcessingException if an I/O error occurs during communication
304      * with the external program
305      */
306     protected PredictionDistribution doClassify(final FeatureVector features,
307             final Set candidateClasses, final ContextMap context)
308     throws ProcessingException {
309         final Map<String, String> result;
310 
311         synchronized (guard) {
312             // ensure that classes + features are set correctly
313             adjustClasses(candidateClasses);
314             adjustFeatures(features);
315 
316             // send classify command
317             result = executeCommand("classify", "");
318         }
319 
320         // build prediction distribution: use sigmoid function of pR if
321         // there are two classes, otherwise just store the probability of
322         // the most likely class
323         final PredictionDistribution predDist = new PredictionDistribution();
324         final String winnerClass = stripInstanceSuffix(result.get("class"));
325 
326         if (candidateClasses.size() == 2) {
327             final double pR = Util.asDouble(result.get("pR"));
328             // sigmoid function of pR in range [0.5, 1.0[: will be 0.5
329             // if pR == 0 and increasingly larger for higher absolute scores
330             final double winnerProb = 1.0 / (1.0 + Math.exp(-Math.sqrt(
331                     Math.abs(pR))));
332             final double loserProb = 1.0 - winnerProb;
333             // add prediction for winner class
334             predDist.add(new Prediction(winnerClass,
335                     new Probability(winnerProb)));
336 
337             // and for loser class
338             String loserClass;
339             final Iterator candidateIter = candidateClasses.iterator();
340             do {
341                 loserClass = (String) candidateIter.next();
342             } while (loserClass.equals(winnerClass));
343 
344             predDist.add(new Prediction(loserClass,
345                     new Probability(loserProb)));
346             //Util.LOG.debug("pR: " + pR);
347         } else {
348             predDist.add(new Prediction(winnerClass,
349                     new Probability(Util.asDouble(result.get("prob")))));
350         }
351         //Util.LOG.debug("Predictions: " + predDist.toString());
352 
353         // store reinforcement status
354         context.put(KEY_REINFORCE, result.get(KEY_REINFORCE));
355         return predDist;
356     }
357 
358     /***
359      * {@inheritDoc} <strong>Currently this method is not implemented since
360      * Moonfilter needs to know the list of active classes.</strong>
361      */
362     protected void doTrain(final FeatureVector features,
363             final String targetClass, final ContextMap context)
364     throws ProcessingException {
365         // we cannot support this method because we need to know all
366         // candidate classes for updating
367         throw new UnsupportedOperationException("Moonfilter needs to know the "
368             + "list of active classes -- call trainOnError instead of train");
369     }
370 
371     /***
372      * Helper method than sends a command to the MoonFilter and returns the
373      * result as a hash map. The current implementatation only parses simple
374      * "name=value" pairs correctly (no nested or quoted values) and does not
375      * handle multi-line responses (with backslash-escaped line ends).
376      * This method should always be executed in a synchronized context (as it
377      * doesn't synchronize itself).
378      *
379      * @param command the command to execute
380      * @param params a string containing the parameters for the command, if any
381      * -- may be empty but not <code>null</code>
382      * @return the results (if any) of the command as a map of strings
383      * @throws ProcessingException thrown if the MoonFilter returns "fail"
384      * or does not respond or an I/O error occurs during communication
385      */
386     private Map<String, String> executeCommand(final String command,
387             final String params) throws ProcessingException {
388         final Map<String, String> result = new HashMap<String, String>();
389         try {
390             // send command + read response
391             IOUtils.writeLine(command + " " + params, feedToFilter);
392             feedToFilter.flush();
393             final int bytesRead =
394                 IOUtils.readUntilLineEnd(readFromFilter, answerBuffer);
395             final String answer = new String(answerBuffer, 0, bytesRead);
396             // check that answer starts with "<command> ok"
397             if (OK_MATCHER.reset(answer).find()) {
398                 // parse name=value pairs
399                 NAME_VALUE_MATCHER.reset(answer);
400                 while (NAME_VALUE_MATCHER.find()) {
401                     result.put(NAME_VALUE_MATCHER.group(1),
402                             NAME_VALUE_MATCHER.group(2));
403                 }
404             } else {
405                 // "fail" or other bad answer
406                 throw new ProcessingException("Command '" + command + " "
407                         + params + "' failed: " + answer);
408             }
409         } catch (IOException ioe) {
410             // wrap and rethrow exception
411             throw new ProcessingException("I/O error while executing command",
412                     ioe);
413         }
414         return result;
415     }
416 
417     /***
418      * Initializes the classifier. This method should always be executed in a
419      * synchronized context (as it doesn't synchronize itself).
420      *
421      * @throws ProcessingException if an I/O error occurs during initialization
422      */
423     private void initialize() throws ProcessingException {
424         // start process
425         try {
426             final ProcessBuilder pb = new ProcessBuilder("moonrunner.lua");
427             pb.redirectErrorStream(true); // merge stderr + stdout of the filter
428             if (workDir != null) {
429                 pb.directory(workDir);
430             }
431             moonClassifier = pb.start();
432             feedToFilter =
433                 new OutputStreamWriter(moonClassifier.getOutputStream());
434             readFromFilter = moonClassifier.getInputStream();
435         } catch (IOException ioe) {
436             // wrap and rethrow exception
437             throw new ProcessingException(
438                     "I/O error while initializing classifier", ioe);
439         }
440 
441         // configure any specified params (e.g. number of buckets)
442         final Configuration paramConfig =
443             getConfig().subset("classifier.moon.param");
444         final Iterator keyIter = paramConfig.getKeys();
445         String key, value;
446 
447         while (keyIter.hasNext()) {
448             key = (String) keyIter.next();
449             value = paramConfig.getString(key);
450             Util.LOG.debug("Configuring MoonClassifier: " + key + " = "
451                     + value);
452             executeCommand(key, value);
453         }
454     }
455 
456     /***
457      * {@inheritDoc}
458      */
459     public void reset() throws ProcessingException {
460         synchronized (guard) {
461             if (moonClassifier != null) {
462                 // destroy + re-create class databases
463                 adjustClasses(getAllClasses());
464                 executeCommand("destroy", "");
465                 executeCommand("create", "");
466                 Util.LOG.info("Reset class databases for "
467                         + getAllClasses().toString());
468             } else {
469                 Util.LOG.debug(
470                     "Nothing to reset: wrapped MoonFilter instance is null");
471             }
472         }
473     }
474 
475     /***
476      * Checks whether training is necessary, either due to a misclassification
477      * or for reinforcement.
478      *
479      * @param targetClass the expected class of this feature vector; must be
480      * contained in the set of <code>candidateClasses</code>
481      * @param predDist the prediction distribution returned by
482      * {@link #doClassify(FeatureVector, Set, ContextMap)}
483      * @param context used to transport the reinforcement status returned by
484      * the MoonFilter
485      * @return whether to train this instance
486      */
487     private boolean isTrainingNecessary(final String targetClass,
488             final PredictionDistribution predDist, final ContextMap context) {
489         // training in case of misclassification (normal train-on-error) or if
490         // reinforcement training is requested
491         final boolean result;
492 
493         if (shouldTrain(targetClass, predDist, context)) {
494             result = true;
495             //Util.LOG.debug("Training misclassified or borderline");
496         } else {
497             final String reinforceParam = (String) context.get(KEY_REINFORCE);
498 
499             if ("true".equals(reinforceParam)) {
500                 result = true;
501                 //Util.LOG.debug("Reinforcement training");
502             } else if ("false".equals(reinforceParam)) {
503                 result = false;
504             } else {
505                 // not supposed to happen
506                 throw new IllegalArgumentException(
507                         "Invalid reinforcement parameter: " + reinforceParam);
508             }
509         }
510         return result;
511     }
512 
513     /***
514      * Strips the {@link #instanceSuffix} (if any) from a string, ensuring that
515      * the string does actually end in the suffix.
516      *
517      * @param str the string to convert
518      * @return the string without the instance suffix
519      * @throws IllegalArgumentException if the string does not end in the
520      * instance suffix
521      */
522     private String stripInstanceSuffix(final String str)
523     throws IllegalArgumentException {
524         if (instanceSuffix.length() == 0) {
525             // nothing to strip
526             return str;
527         } else {
528             final int difference = str.length() - instanceSuffix.length();
529             // ensure that string ends in the suffix and contains a prefix
530             if (difference <= 0
531                     || !str.substring(difference).equals(instanceSuffix)) {
532                 throw new IllegalArgumentException("'" + str
533                         + "' does not end in instance suffix '"
534                         + instanceSuffix + "'");
535             }
536 
537             // return the prefix
538             return str.substring(0, difference);
539         }
540     }
541 
542     /***
543      * {@inheritDoc} Currently, this classifier does not support XML
544      * serialization, throwing an {@link UnsupportedOperationException} instead.
545      *
546      * @throws UnsupportedOperationException always thrown by this
547      * implementation
548      */
549     public ObjectElement toElement() throws UnsupportedOperationException {
550             throw new UnsupportedOperationException(
551                     "XML serialization is not supported by MoonClassifier");
552     }
553 
554     /***
555      * {@inheritDoc}
556      */
557     protected boolean trainOnErrorHook(final PredictionDistribution predDist,
558             final FeatureVector features, final String targetClass,
559             final Set candidateClasses, final ContextMap context)
560     throws ProcessingException {
561         if (isTrainingNecessary(targetClass, predDist, context)) {
562             final Map<String, String> result;
563             final String fullTargetClass = targetClass + instanceSuffix;
564 
565             synchronized (guard) {
566                 // ensure that classes + features and are set correctly
567                 adjustClasses(candidateClasses);
568                 adjustFeatures(features);
569 
570                 // send train command
571                 result = executeCommand("train", fullTargetClass);
572             }
573 
574             // count number of misclassifications and reinforcements
575             if (Util.asBoolean(result.get("misclassified"))) {
576                 misclassifications++;
577             }
578             if (Util.asBoolean(result.get("reinforced"))) {
579                 reinforcements++;
580             }
581             //Util.LOG.debug("Trained as " + fullTargetClass);
582         }
583 
584         // signal that we handled the training
585         return true;
586     }
587 
588     /***
589      * Returns a string representation of this object.
590      *
591      * @return a textual representation
592      */
593     public String toString() {
594         final ToStringBuilder builder = new ToStringBuilder(this)
595             .appendSuper(super.toString())
596             .append("instance suffix", instanceSuffix)
597             .append("active classes", currentClasses)
598             .append("misclassifications", misclassifications)
599             .append("reinforcements", reinforcements);
600         return builder.toString();
601     }
602 
603 }