View Javadoc

1   /*
2    * Copyright (C) 2005-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.demo;
23  
24  import java.io.File;
25  import java.io.FileWriter;
26  import java.io.StringReader;
27  import java.io.IOException;
28  import java.io.Writer;
29  import java.util.ArrayList;
30  import java.util.Collections;
31  import java.util.HashSet;
32  import java.util.Iterator;
33  import java.util.Set;
34  
35  import org.apache.commons.collections.keyvalue.MultiKey;
36  import org.apache.commons.lang.builder.ToStringBuilder;
37  
38  import de.fu_berlin.ties.ProcessingException;
39  import de.fu_berlin.ties.TiesConfiguration;
40  import de.fu_berlin.ties.classify.Classifier;
41  import de.fu_berlin.ties.classify.ClassTrain;
42  import de.fu_berlin.ties.classify.PredictionDistribution;
43  import de.fu_berlin.ties.classify.TrainableClassifier;
44  import de.fu_berlin.ties.classify.feature.FeatureVector;
45  import de.fu_berlin.ties.classify.winnow.Winnow;
46  import de.fu_berlin.ties.io.IOUtils;
47  import de.fu_berlin.ties.text.TokenizingExtractor;
48  import de.fu_berlin.ties.util.Util;
49  
50  /***
51   * Instances of this class can be used to demonstrate the how statistical spam
52   * filtering works. This class supports only the
53   * {@link de.fu_berlin.ties.classify.winnow.Winnow} classifier and subclasses.
54   *
55   * @author Christian Siefkes
56   * @version $Revision: 1.18 $, $Date: 2006/10/21 16:04:09 $, $Author: siefkes $
57   */
58  public class SpamFilterDemo {
59  
60      /***
61       * Name of the spam class: {@value}.
62       */
63      public static final String CLASS_SPAM = "spam";
64  
65      /***
66       * Name of the nonspam (ham) class: {@value}.
67       */
68      public static final String CLASS_NONSPAM = "nonspam";
69  
70      /***
71       * Main method for testing.
72       *
73       * @param args the command-line arguments (ignored)
74       * @throws IOException if an I/O error occurs
75       * @throws ProcessingException if an error orrurs while processing the tasks
76       */
77      public static void main(final String[] args)
78      throws IOException, ProcessingException {
79          final SpamFilterDemo filterDemo = new SpamFilterDemo(
80                  "/home/datsche/siefkes/lib/filterdemo/lange-nacht-training.zip",
81                  "/home/datsche/siefkes/lib/filterdemo/lange-nacht-testing.zip");
82          final SampleMails trainingMails = filterDemo.getTrainingSet();
83          Util.LOG.info("Training set: " + trainingMails.spamCount()
84                  + "/" + trainingMails.nonspamCount());
85          final SampleMails testMails = filterDemo.getTestSet();
86          Util.LOG.info("Test set: " + testMails.spamCount() + "/"
87                  + testMails.nonspamCount());
88  
89          final String[] nonspamSubs = testMails.nonspamSubjects();
90          final int testMessagePos = nonspamSubs.length / 2;
91          Util.LOG.info("Will test representation of nonspam #" + testMessagePos
92                  + ": " + nonspamSubs[testMessagePos]);
93          final String testMessage = testMails.getNonspam(testMessagePos);
94          FilterResult result = filterDemo.classify(testMessage);
95          Util.LOG.info("Classification result: " + result);
96          final File dumpFile = new File(IOUtils.userHome()
97                  + File.separator + "new", "dump.html");
98          final Writer dumper = new FileWriter(dumpFile);
99          try {
100             result.writeTestHTML(dumper);
101         } finally {
102             IOUtils.tryToClose(dumper);
103         }
104         Util.LOG.info("Dumped visual representation to " + dumpFile);
105 
106 
107 /*        // check accuracy on test set
108         FilterResult result;
109         int i;
110         int spamErrors = 0;
111         final String[] spamSubs = testMails.spamSubjects();
112 
113         for (i = 0; i < spamSubs.length; i++) {
114             result = filterDemo.classify(testMails.getSpam(i));
115             if (!result.getPredictedClass().equals(CLASS_SPAM)) {
116                 Util.LOG.info("Misclassified spam " + i + " (" + spamSubs[i]
117                         + "): " + result);
118                 spamErrors++;
119             } else {
120                 Util.LOG.debug("Correctly classified spam " + i + " ("
121                         + spamSubs[i] + "): " + result);
122             }
123         }
124         Util.LOG.info("Misclassified " + spamErrors + " of " + spamSubs.length
125                 + " spam mails");
126 
127         int nonspamErrors = 0;
128 
129         for (i = 0; i < nonspamSubs.length; i++) {
130             result = filterDemo.classify(testMails.getNonspam(i));
131             if (!result.getPredictedClass().equals(CLASS_NONSPAM)) {
132                 Util.LOG.info("Misclassified nonspam " + i + " ("
133                         + nonspamSubs[i] + "): " + result);
134                 nonspamErrors++;
135             } else {
136                 Util.LOG.debug("Correctly classified nonspam " + i + " ("
137                         + nonspamSubs[i] + "): " + result);
138             }
139         }
140         Util.LOG.info("Misclassified " + nonspamErrors + " of "
141                 + nonspamSubs.length + " nonspam mails"); */
142     }
143 
144 
145     /***
146      * The set of mails used for testing.
147      */
148     private final SampleMails testSet;
149 
150     /***
151      * The set of mails used for training.
152      */
153     private final SampleMails trainingSet;
154 
155     /***
156      * The classifier used by this instance.
157      */
158     private final Winnow classifier;
159 
160     /***
161      * Used to convert text sequences into feature vectors.
162      * Synchronized on itself.
163      */
164     private final TokenizingExtractor featureExtractor;
165 
166     /***
167      * Creates a new instance. The ZIP files must follow the
168      * {@link SampleMails} conventions.
169      *
170      * @param trainingSetFile a ZIP file containing the mails used for training
171      * @param testSetFile a ZIP file containing the mails used for testing
172      * @throws IOException if one of the files cannot be read or is not a
173      * valid ZIP file
174      * @throws ProcessingException if an error occurs while initializing the
175      * classifier
176      */
177     public SpamFilterDemo(final String trainingSetFile,
178             final String testSetFile) throws IOException, ProcessingException {
179         this(new File(trainingSetFile), new File(testSetFile));
180     }
181 
182     /***
183      * Creates a new instance. The ZIP files must follow the
184      * {@link SampleMails} conventions.
185      *
186      * @param trainingSetFile a ZIP file containing the mails used for training
187      * @param testSetFile a ZIP file containing the mails used for testing
188      * @throws IOException if one of the files cannot be read or is not a
189      * valid ZIP file
190      * @throws ProcessingException if an error occurs while initializing the
191      * classifier
192      */
193     public SpamFilterDemo(final File trainingSetFile,
194             final File testSetFile) throws IOException, ProcessingException {
195         this(new SampleMails(trainingSetFile), new SampleMails(testSetFile));
196     }
197 
198     /***
199      * Creates a new instance.
200      *
201      * @param myTrainingSet the set of mails used for training
202      * @param myTestSet the set of mails used for testing
203      * @throws ProcessingException if an error occurs while initializing the
204      * classifier
205      * @throws IOException if an I/O error occurs
206      */
207     public SpamFilterDemo(final SampleMails myTrainingSet,
208             final SampleMails myTestSet)
209     throws ProcessingException, IOException {
210         super();
211         trainingSet = myTrainingSet;
212         testSet = myTestSet;
213 
214         // use separate models for each classifier
215         TiesConfiguration.CONF.setProperty("classifier.winnow.shared-store",
216                 false);
217 
218         // initialize classifier and extractor
219         final Set<String> classSet = new HashSet<String>(2);
220         classSet.add(CLASS_NONSPAM);
221         classSet.add(CLASS_SPAM);
222         final TrainableClassifier myClassifier =
223             TrainableClassifier.createClassifier(classSet,
224                 TiesConfiguration.CONF, ClassTrain.CONFIG_SUFFIX_TEXT);
225         if (myClassifier instanceof Winnow) {
226             classifier = (Winnow) myClassifier;
227         } else {
228             throw new IllegalArgumentException(
229                     "Only Winnow-based classifiers are supported");
230         }
231         featureExtractor = new TokenizingExtractor(TiesConfiguration.CONF,
232                 Classifier.CONFIG_CLASSIFIER);
233 
234         // load initial classification model
235         reloadModel();
236     }
237 
238 
239     /***
240      * Helper method for extracting features from text.
241      *
242      * @param text the text to process
243      * @return a feature vector representing the text
244      * @throws IOException if an I/O error occurs
245      */
246     private FeatureVector buildFeatures(final String text) throws IOException {
247         // synchronize on the extractor to avoid collisions
248         synchronized (featureExtractor) {
249             return featureExtractor.buildFeatures(new StringReader(text));
250         }
251     }
252 
253     /***
254      * Classifies a text.
255      *
256      * @param text the text to train
257      * @return a {@link FilterResult} containing detailed results of
258      * the classification
259      * @throws ProcessingException if an error occurs during classification
260      * @throws IOException if an I/O error occurs
261      */
262     public FilterResult classify(final String text)
263      throws ProcessingException, IOException {
264         final FeatureVector features = buildFeatures(text);
265         final PredictionDistribution predDist =
266             classifier.classify(features, classifier.getAllClasses());
267         final FilterResult result = new FilterResult(predDist, text,
268                 featureExtractor, classifier.showFeatureWeights(features));
269         return result;
270     }
271 
272     /***
273      * Completely resets the internal classification model. After a reset, the
274      * classifier will have no idea how "spam" or "nonspam" messages typically
275      * look like.
276      *
277      * @throws ProcessingException if an error occurs during reset
278      */
279     public void clearModel() throws ProcessingException {
280         classifier.reset();
281     }
282 
283     /***
284      * Returns the set of mails used for testing.
285      * @return the value of the attribute
286      */
287     public SampleMails getTestSet() {
288         return testSet;
289     }
290 
291     /***
292      * Returns the set of mails used for training.
293      * @return the value of the attribute
294      */
295     public SampleMails getTrainingSet() {
296         return trainingSet;
297     }
298 
299     /***
300      * Reloads the inital state of the internal classification model.
301      * The model is {@link #clearModel() cleared} and then re-trained from the
302      * sample mails contained the {@linkplain #getTrainingSet() training set}
303      * (shuffled in pseudo-random order).
304      *
305      * @throws ProcessingException if an error occurs during reset
306      * @throws IOException if an I/O error occurs
307      */
308     public void reloadModel() throws ProcessingException, IOException {
309         // delete current model
310         clearModel();
311 
312         // combine spam + nonspam mails in single list
313         final ArrayList<MultiKey> samples = new ArrayList<MultiKey>(
314                 trainingSet.spamCount() + trainingSet.nonspamCount());
315         MultiKey key;
316         int i;
317 
318         for (i = 0; i < trainingSet.spamCount(); i++) {
319             key = new MultiKey(CLASS_SPAM, Integer.valueOf(i));
320             samples.add(key);
321         }
322         for (i = 0; i < trainingSet.nonspamCount(); i++) {
323             key = new MultiKey(CLASS_NONSPAM, Integer.valueOf(i));
324             samples.add(key);
325         }
326 
327         // pseudo-randomly shuffle the list in a reproducable way
328         Collections.shuffle(samples, Util.reproducibleRandom());
329 
330         // iterate shuffles list and train all instances
331         final Iterator<MultiKey> sampleIter = samples.iterator();
332         Object type;
333         int pos;
334 
335         while (sampleIter.hasNext()) {
336             key = sampleIter.next();
337             type = key.getKey(0);
338             pos = (Integer) key.getKey(1);
339             String message;
340 
341             if (type == CLASS_SPAM) {
342                 // train as spam
343                 Util.LOG.debug("Training spam sample " + pos);
344                 message = trainingSet.getSpam(pos);
345                 trainSpam(message);
346             } else if (type == CLASS_NONSPAM) {
347                 // train as nonspam
348                 Util.LOG.debug("Training nonspam sample " + pos);
349                 message = trainingSet.getNonspam(pos);
350                 trainNonspam(message);
351             } else {
352                 // not supposed to happen
353                 throw new RuntimeException(
354                         "Implementation error: unexpected type" + type);
355             }
356         }
357     }
358 
359     /***
360      * Returns a string representation of this object.
361      *
362      * @return a textual representation
363      */
364     public String toString() {
365         return new ToStringBuilder(this)
366             .append("classifier", classifier)
367             .append("feature extractor", featureExtractor)
368             .append("training set", trainingSet)
369             .append("test set", testSet)
370             .toString();
371     }
372 
373     /***
374      * Trains a text as ham.
375      *
376      * @param text the text to train
377      * @throws ProcessingException if an error occurs during training
378      * @throws IOException if an I/O error occurs
379      */
380     public void trainNonspam(final String text)
381     throws ProcessingException, IOException {
382         final FeatureVector features = buildFeatures(text);
383         classifier.trainOnError(features, CLASS_NONSPAM,
384                 classifier.getAllClasses());
385     }
386 
387     /***
388      * Trains a text as spam.
389      *
390      * @param text the text to train
391      * @throws ProcessingException if an error occurs during training
392      * @throws IOException if an I/O error occurs
393      */
394     public void trainSpam(final String text)
395     throws ProcessingException, IOException {
396         final FeatureVector features = buildFeatures(text);
397         classifier.trainOnError(features, CLASS_SPAM,
398                 classifier.getAllClasses());
399     }
400 
401 }