View Javadoc

1   /*
2    * Copyright (C) 2005-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.classify;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.util.List;
27  
28  import org.apache.commons.lang.builder.ToStringBuilder;
29  
30  import de.fu_berlin.ties.CollectingProcessor;
31  import de.fu_berlin.ties.ContextMap;
32  import de.fu_berlin.ties.ProcessingException;
33  import de.fu_berlin.ties.TiesConfiguration;
34  import de.fu_berlin.ties.classify.feature.FeatureExtractor;
35  import de.fu_berlin.ties.classify.feature.FeatureExtractorFactory;
36  import de.fu_berlin.ties.classify.feature.FeatureVector;
37  import de.fu_berlin.ties.io.IOUtils;
38  import de.fu_berlin.ties.util.CollUtils;
39  
40  /***
41   * A text filter provides a simple API for classifying text files. All instances
42   * of this class share a common
43   * {@link de.fu_berlin.ties.classify.TrainableClassifier} which will be
44   * initialized by the first created instance. This class it meant to be
45   * used with <a href="http://www.martiansoftware.com/nailgun/">NailGun</a> to
46   * avoid the cost of creating the virtual machine and to allow re-using the
47   * same classifier instance between multiple calls.
48   *
49   * <p>The classifier is configured from the provided
50   * configuration, using the
51   * {@link de.fu_berlin.ties.classify.ClassTrain#CONFIG_SUFFIX_TEXT} suffix
52   * to allow text-specific settings. The classes to consider for classification
53   * are read from the {@link TextFilter#CONFIG_CLASSES} parameter.
54   * The probability of the very first class will be returned as "score".
55   *
56   * <p>This class is meant to be invoked on the command line as "filter" goal to
57   * classify or train a text file. It supports two commands, "classify" and
58   * "train":
59   * <dl>
60   * <dt>classify FILENAME<dt>
61   * <dd>Classifies the given file, writing a single line of output to
62   * {@link System#out}:<br>
63   * <tt>class=PREDICTED-CLASS score=PROB.-OF-FIRST-CLASS
64   * prob==PROB.-OF-PREDICTED-CLASS</tt></dd>
65   * <dt>train TRUE-CLASS FILENAME<dt>
66   * <dd>Trains the given file as TRUE-CLASS (must be one of the configured
67   * classes).</dd>
68   * </dl>
69   *
70   * <p>Additional parameters after the required arguments are allowed but
71   * ignored; other commands will be treated as errors.
72   *
73   * @author Christian Siefkes
74   * @version $Revision: 1.7 $, $Date: 2006/10/21 16:03:55 $, $Author: siefkes $
75   */
76  public class TextFilter extends CollectingProcessor {
77  
78      /***
79       * Configuration key: Names of the classes used to filter text. 
80       * The probability of the very first class will be returned as "score".
81       */
82      public static final String CONFIG_CLASSES = "textfilter.classes";
83  
84      /***
85       * Used to guard access to the classifier.
86       */
87      private static final Object GUARD = new Object();
88  
89      /***
90       * Classifier shared by all instances of this object.
91       */
92      private static TrainableClassifier classifier = null;
93  
94      /***
95       * Used to convert text sequences into feature vectors.
96       */
97      private static FeatureExtractor featureExt;
98  
99      /***
100      * The class whose score (probability) should be output.
101      */
102     private static String scoreClass = null;
103 
104     /***
105      * Used to cache the last generated feature vector between successive calls.
106      */
107     private static FeatureVector cachedFeatures = null;
108 
109     /***
110      * Used to cache the last generated feature vector between successive calls.
111      */
112     private static String cachedAbsPath = "";
113 
114     /***
115      * Used to cache the last generated feature vector between successive calls.
116      */
117     private static long cachedLastModified = -1;
118 
119     /***
120      * Used to cache the last generated feature vector between successive calls.
121      */
122     private static long cachedLength = -1;
123 
124 
125     /***
126      * Creates a new instance, configured from the
127      * {@linkplain TiesConfiguration#CONF default configuration}.
128      *
129      * @throws ProcessingException if the configured classifier instance cannot
130      * be instantiated
131      */
132     public TextFilter() throws ProcessingException {
133         this(TiesConfiguration.CONF);
134     }
135 
136     /***
137      * Creates a new instance. 
138      *
139      * @param conf used to configure this instance
140      * @throws ProcessingException if the configured classifier instance cannot
141      * be instantiated
142      */
143     public TextFilter(final TiesConfiguration conf) throws ProcessingException {
144         super(conf);
145 
146         // initialize shared classifier and related fields if not yet done
147         synchronized (GUARD) {
148             if (scoreClass == null) {
149                 final String[] classNames = conf.getStringArray(CONFIG_CLASSES);
150 
151                 if (classNames.length < 2) {
152                     throw new IllegalArgumentException(
153                             "Not enough classes to filter text: "
154                             + classNames.length);
155                 }
156 
157                 scoreClass = classNames[0];
158                 featureExt = FeatureExtractorFactory.createExtractor(
159                         conf, Classifier.CONFIG_CLASSIFIER);
160                 classifier = TrainableClassifier.createClassifier(
161                         CollUtils.arrayAsSet(classNames), conf,
162                         ClassTrain.CONFIG_SUFFIX_TEXT);
163             }
164         }
165     }
166 
167     /***
168      * Helper method for extracting features from a text file. Caches and
169      * re-uses features from the last call to this method if possible. This
170      * method is meant to be evoked in a synchronized context, if necessary;
171      * it is not synchronized by itself.
172      *
173      * @param file the file to process
174      * @return a feature vector representing the file contents
175      * @throws IOException if an I/O error occurs
176      * @throws ProcessingException if an error occurs while processing the text
177      */
178     private FeatureVector buildFeatures(final File file)
179     throws IOException, ProcessingException {
180         final FeatureVector result;
181         final String absolutePath = file.getAbsolutePath();
182         final long lastModified = file.lastModified();
183         final long length = file.length();
184 
185         // re-use cached feature vector if possible
186         if (absolutePath.equals(cachedAbsPath)
187                 && (cachedLastModified == lastModified)
188                 && (cachedLength == length)) {
189             // it's the same file with unchanged length and last-modified date
190             result = cachedFeatures;
191         } else {
192             result = featureExt.buildFeatures(
193                     IOUtils.openReader(file, getConfig()));
194 
195             // cache to allow re-use
196             cachedFeatures = result;
197             cachedAbsPath = absolutePath;
198             cachedLastModified = lastModified;
199             cachedLength = length;
200         }
201 
202         return result;
203     }
204 
205     /***
206      * {@inheritDoc}
207      */
208     public void process(final List<String> collected, final ContextMap context)
209             throws IOException, ProcessingException {
210         // check command to execute (first argument
211         if (collected.isEmpty()) {
212             throw new IllegalArgumentException("Missing command");
213         }
214         final String command = collected.get(0);
215         final String filename;
216         final FeatureVector features;
217 
218         if ("classify".equals(command)) {
219             // classify FILENAME
220             if (collected.size() < 2) {
221                 throw new IllegalArgumentException(
222                         "Not enough arguments for classify command");
223             }
224             filename = collected.get(1);
225 
226             synchronized (GUARD) {
227                 features = buildFeatures(new File(filename));
228                 final PredictionDistribution predDist =
229                     classifier.classify(features, classifier.getAllClasses());
230 
231                 // name and probability of best class
232                 final Prediction bestPred = predDist.best();
233                 final String predictedClass = bestPred.getType();
234                 final double bestProb = bestPred.getProbability().getProb();
235 
236                 // probability of "score" class (first in configured list)
237                 final Prediction scorePred =
238                     predDist.getPredictionForType(scoreClass);
239                 final double scoreProb = scorePred.getProbability().getProb();
240 
241                 // output: class=PREDICTED-CLASS score=PROB.-OF-FIRST-CLASS
242                 // prob==PROB.-OF-PREDICTED-CLASS
243                 final StringBuilder output = new StringBuilder("class=")
244                     .append(predictedClass)
245                     .append(" score=")
246                     .append(scoreProb)
247                     .append(" prob=")
248                     .append(bestProb);
249                 System.out.println(output.toString());
250             }
251         } else if ("train".equals(command)) {
252             // train TRUE-CLASS FILENAME
253             if (collected.size() < 3) {
254                 throw new IllegalArgumentException(
255                         "Not enough arguments for train command");
256             }
257             final String trueClass = collected.get(1);
258             filename = collected.get(2);
259 
260             synchronized (GUARD) {
261                 features = buildFeatures(new File(filename));
262                 classifier.trainOnError(features, trueClass,
263                         classifier.getAllClasses());
264             }
265         } else {
266             throw new IllegalArgumentException("Unknown command: " + command);
267         }
268     }
269 
270     /***
271      * Returns a string representation of this object.
272      *
273      * @return a textual representation
274      */
275     public String toString() {
276         return new ToStringBuilder(this)
277             .append("classifier", classifier)
278             .toString();
279     }
280 
281 }