1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.demo;
23
24 import java.io.File;
25 import java.io.FileWriter;
26 import java.io.StringReader;
27 import java.io.IOException;
28 import java.io.Writer;
29 import java.util.ArrayList;
30 import java.util.Collections;
31 import java.util.HashSet;
32 import java.util.Iterator;
33 import java.util.Set;
34
35 import org.apache.commons.collections.keyvalue.MultiKey;
36 import org.apache.commons.lang.builder.ToStringBuilder;
37
38 import de.fu_berlin.ties.ProcessingException;
39 import de.fu_berlin.ties.TiesConfiguration;
40 import de.fu_berlin.ties.classify.Classifier;
41 import de.fu_berlin.ties.classify.ClassTrain;
42 import de.fu_berlin.ties.classify.PredictionDistribution;
43 import de.fu_berlin.ties.classify.TrainableClassifier;
44 import de.fu_berlin.ties.classify.feature.FeatureVector;
45 import de.fu_berlin.ties.classify.winnow.Winnow;
46 import de.fu_berlin.ties.io.IOUtils;
47 import de.fu_berlin.ties.text.TokenizingExtractor;
48 import de.fu_berlin.ties.util.Util;
49
50 /***
51 * Instances of this class can be used to demonstrate the how statistical spam
52 * filtering works. This class supports only the
53 * {@link de.fu_berlin.ties.classify.winnow.Winnow} classifier and subclasses.
54 *
55 * @author Christian Siefkes
56 * @version $Revision: 1.18 $, $Date: 2006/10/21 16:04:09 $, $Author: siefkes $
57 */
58 public class SpamFilterDemo {
59
60 /***
61 * Name of the spam class: {@value}.
62 */
63 public static final String CLASS_SPAM = "spam";
64
65 /***
66 * Name of the nonspam (ham) class: {@value}.
67 */
68 public static final String CLASS_NONSPAM = "nonspam";
69
70 /***
71 * Main method for testing.
72 *
73 * @param args the command-line arguments (ignored)
74 * @throws IOException if an I/O error occurs
75 * @throws ProcessingException if an error orrurs while processing the tasks
76 */
77 public static void main(final String[] args)
78 throws IOException, ProcessingException {
79 final SpamFilterDemo filterDemo = new SpamFilterDemo(
80 "/home/datsche/siefkes/lib/filterdemo/lange-nacht-training.zip",
81 "/home/datsche/siefkes/lib/filterdemo/lange-nacht-testing.zip");
82 final SampleMails trainingMails = filterDemo.getTrainingSet();
83 Util.LOG.info("Training set: " + trainingMails.spamCount()
84 + "/" + trainingMails.nonspamCount());
85 final SampleMails testMails = filterDemo.getTestSet();
86 Util.LOG.info("Test set: " + testMails.spamCount() + "/"
87 + testMails.nonspamCount());
88
89 final String[] nonspamSubs = testMails.nonspamSubjects();
90 final int testMessagePos = nonspamSubs.length / 2;
91 Util.LOG.info("Will test representation of nonspam #" + testMessagePos
92 + ": " + nonspamSubs[testMessagePos]);
93 final String testMessage = testMails.getNonspam(testMessagePos);
94 FilterResult result = filterDemo.classify(testMessage);
95 Util.LOG.info("Classification result: " + result);
96 final File dumpFile = new File(IOUtils.userHome()
97 + File.separator + "new", "dump.html");
98 final Writer dumper = new FileWriter(dumpFile);
99 try {
100 result.writeTestHTML(dumper);
101 } finally {
102 IOUtils.tryToClose(dumper);
103 }
104 Util.LOG.info("Dumped visual representation to " + dumpFile);
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142 }
143
144
145 /***
146 * The set of mails used for testing.
147 */
148 private final SampleMails testSet;
149
150 /***
151 * The set of mails used for training.
152 */
153 private final SampleMails trainingSet;
154
155 /***
156 * The classifier used by this instance.
157 */
158 private final Winnow classifier;
159
160 /***
161 * Used to convert text sequences into feature vectors.
162 * Synchronized on itself.
163 */
164 private final TokenizingExtractor featureExtractor;
165
166 /***
167 * Creates a new instance. The ZIP files must follow the
168 * {@link SampleMails} conventions.
169 *
170 * @param trainingSetFile a ZIP file containing the mails used for training
171 * @param testSetFile a ZIP file containing the mails used for testing
172 * @throws IOException if one of the files cannot be read or is not a
173 * valid ZIP file
174 * @throws ProcessingException if an error occurs while initializing the
175 * classifier
176 */
177 public SpamFilterDemo(final String trainingSetFile,
178 final String testSetFile) throws IOException, ProcessingException {
179 this(new File(trainingSetFile), new File(testSetFile));
180 }
181
182 /***
183 * Creates a new instance. The ZIP files must follow the
184 * {@link SampleMails} conventions.
185 *
186 * @param trainingSetFile a ZIP file containing the mails used for training
187 * @param testSetFile a ZIP file containing the mails used for testing
188 * @throws IOException if one of the files cannot be read or is not a
189 * valid ZIP file
190 * @throws ProcessingException if an error occurs while initializing the
191 * classifier
192 */
193 public SpamFilterDemo(final File trainingSetFile,
194 final File testSetFile) throws IOException, ProcessingException {
195 this(new SampleMails(trainingSetFile), new SampleMails(testSetFile));
196 }
197
198 /***
199 * Creates a new instance.
200 *
201 * @param myTrainingSet the set of mails used for training
202 * @param myTestSet the set of mails used for testing
203 * @throws ProcessingException if an error occurs while initializing the
204 * classifier
205 * @throws IOException if an I/O error occurs
206 */
207 public SpamFilterDemo(final SampleMails myTrainingSet,
208 final SampleMails myTestSet)
209 throws ProcessingException, IOException {
210 super();
211 trainingSet = myTrainingSet;
212 testSet = myTestSet;
213
214
215 TiesConfiguration.CONF.setProperty("classifier.winnow.shared-store",
216 false);
217
218
219 final Set<String> classSet = new HashSet<String>(2);
220 classSet.add(CLASS_NONSPAM);
221 classSet.add(CLASS_SPAM);
222 final TrainableClassifier myClassifier =
223 TrainableClassifier.createClassifier(classSet,
224 TiesConfiguration.CONF, ClassTrain.CONFIG_SUFFIX_TEXT);
225 if (myClassifier instanceof Winnow) {
226 classifier = (Winnow) myClassifier;
227 } else {
228 throw new IllegalArgumentException(
229 "Only Winnow-based classifiers are supported");
230 }
231 featureExtractor = new TokenizingExtractor(TiesConfiguration.CONF,
232 Classifier.CONFIG_CLASSIFIER);
233
234
235 reloadModel();
236 }
237
238
239 /***
240 * Helper method for extracting features from text.
241 *
242 * @param text the text to process
243 * @return a feature vector representing the text
244 * @throws IOException if an I/O error occurs
245 */
246 private FeatureVector buildFeatures(final String text) throws IOException {
247
248 synchronized (featureExtractor) {
249 return featureExtractor.buildFeatures(new StringReader(text));
250 }
251 }
252
253 /***
254 * Classifies a text.
255 *
256 * @param text the text to train
257 * @return a {@link FilterResult} containing detailed results of
258 * the classification
259 * @throws ProcessingException if an error occurs during classification
260 * @throws IOException if an I/O error occurs
261 */
262 public FilterResult classify(final String text)
263 throws ProcessingException, IOException {
264 final FeatureVector features = buildFeatures(text);
265 final PredictionDistribution predDist =
266 classifier.classify(features, classifier.getAllClasses());
267 final FilterResult result = new FilterResult(predDist, text,
268 featureExtractor, classifier.showFeatureWeights(features));
269 return result;
270 }
271
272 /***
273 * Completely resets the internal classification model. After a reset, the
274 * classifier will have no idea how "spam" or "nonspam" messages typically
275 * look like.
276 *
277 * @throws ProcessingException if an error occurs during reset
278 */
279 public void clearModel() throws ProcessingException {
280 classifier.reset();
281 }
282
283 /***
284 * Returns the set of mails used for testing.
285 * @return the value of the attribute
286 */
287 public SampleMails getTestSet() {
288 return testSet;
289 }
290
291 /***
292 * Returns the set of mails used for training.
293 * @return the value of the attribute
294 */
295 public SampleMails getTrainingSet() {
296 return trainingSet;
297 }
298
299 /***
300 * Reloads the inital state of the internal classification model.
301 * The model is {@link #clearModel() cleared} and then re-trained from the
302 * sample mails contained the {@linkplain #getTrainingSet() training set}
303 * (shuffled in pseudo-random order).
304 *
305 * @throws ProcessingException if an error occurs during reset
306 * @throws IOException if an I/O error occurs
307 */
308 public void reloadModel() throws ProcessingException, IOException {
309
310 clearModel();
311
312
313 final ArrayList<MultiKey> samples = new ArrayList<MultiKey>(
314 trainingSet.spamCount() + trainingSet.nonspamCount());
315 MultiKey key;
316 int i;
317
318 for (i = 0; i < trainingSet.spamCount(); i++) {
319 key = new MultiKey(CLASS_SPAM, Integer.valueOf(i));
320 samples.add(key);
321 }
322 for (i = 0; i < trainingSet.nonspamCount(); i++) {
323 key = new MultiKey(CLASS_NONSPAM, Integer.valueOf(i));
324 samples.add(key);
325 }
326
327
328 Collections.shuffle(samples, Util.reproducibleRandom());
329
330
331 final Iterator<MultiKey> sampleIter = samples.iterator();
332 Object type;
333 int pos;
334
335 while (sampleIter.hasNext()) {
336 key = sampleIter.next();
337 type = key.getKey(0);
338 pos = (Integer) key.getKey(1);
339 String message;
340
341 if (type == CLASS_SPAM) {
342
343 Util.LOG.debug("Training spam sample " + pos);
344 message = trainingSet.getSpam(pos);
345 trainSpam(message);
346 } else if (type == CLASS_NONSPAM) {
347
348 Util.LOG.debug("Training nonspam sample " + pos);
349 message = trainingSet.getNonspam(pos);
350 trainNonspam(message);
351 } else {
352
353 throw new RuntimeException(
354 "Implementation error: unexpected type" + type);
355 }
356 }
357 }
358
359 /***
360 * Returns a string representation of this object.
361 *
362 * @return a textual representation
363 */
364 public String toString() {
365 return new ToStringBuilder(this)
366 .append("classifier", classifier)
367 .append("feature extractor", featureExtractor)
368 .append("training set", trainingSet)
369 .append("test set", testSet)
370 .toString();
371 }
372
373 /***
374 * Trains a text as ham.
375 *
376 * @param text the text to train
377 * @throws ProcessingException if an error occurs during training
378 * @throws IOException if an I/O error occurs
379 */
380 public void trainNonspam(final String text)
381 throws ProcessingException, IOException {
382 final FeatureVector features = buildFeatures(text);
383 classifier.trainOnError(features, CLASS_NONSPAM,
384 classifier.getAllClasses());
385 }
386
387 /***
388 * Trains a text as spam.
389 *
390 * @param text the text to train
391 * @throws ProcessingException if an error occurs during training
392 * @throws IOException if an I/O error occurs
393 */
394 public void trainSpam(final String text)
395 throws ProcessingException, IOException {
396 final FeatureVector features = buildFeatures(text);
397 classifier.trainOnError(features, CLASS_SPAM,
398 classifier.getAllClasses());
399 }
400
401 }