1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.demo;
23
24 import java.io.IOException;
25 import java.io.Writer;
26 import java.util.Iterator;
27 import java.util.List;
28 import java.util.Map;
29
30 import org.apache.commons.lang.StringEscapeUtils;
31 import org.apache.commons.lang.builder.ToStringBuilder;
32 import org.apache.velocity.exception.VelocityException;
33
34 import de.fu_berlin.ties.ContextMap;
35 import de.fu_berlin.ties.classify.Prediction;
36 import de.fu_berlin.ties.classify.PredictionDistribution;
37 import de.fu_berlin.ties.classify.winnow.WinnowPrediction;
38 import de.fu_berlin.ties.text.TextTokenizer;
39 import de.fu_berlin.ties.text.TextUtils;
40 import de.fu_berlin.ties.text.TokenizingExtractor;
41 import de.fu_berlin.ties.util.VelocityService;
42
43 /***
44 * Instances of this class contain detailed filtering results with
45 * additional debugging information. This class supports only the
46 * {@link de.fu_berlin.ties.classify.winnow.Winnow} classifier and subclasses.
47 *
48 * @author Christian Siefkes
49 * @version $Revision: 1.14 $, $Date: 2006/10/21 16:04:09 $, $Author: siefkes $
50 */
51 public class FilterResult {
52
53 /***
54 * ID (prefix) used for the feature scores table and child elements.
55 */
56 private static final String FEATURE_SCORE_ID = "featureScoreTable";
57
58
59 /***
60 * Result returned by the classifier.
61 */
62 private final PredictionDistribution predDist;
63
64 /***
65 * Used to convert text sequences into feature vectors.
66 * Synchronized on itself.
67 */
68 private final TokenizingExtractor featureExtractor;
69
70 /***
71 * A mapping from feature representation to weights arrays, as returned by
72 * the {@link de.fu_berlin.ties.classify.winnow.Winnow#showFeatureWeights}
73 * method.
74 */
75 private final Map<String, List<Float>> featureWeights;
76
77 /***
78 * The filtered text.
79 */
80 private final String text;
81
82
83 /***
84 * Creates a new instance.
85 *
86 * @param myPredDist the prediction distribution wrapped by this instance
87 * @param myText the filtered text
88 * @param extractor the feature extractor used to tokenize text sequences
89 * -- will be synchronized on itself
90 * @param myFeatureWeights a mapping from feature representation to weights
91 * arrays, as returned by the
92 * {@link de.fu_berlin.ties.classify.winnow.Winnow#showFeatureWeights}
93 * method
94 */
95 public FilterResult(final PredictionDistribution myPredDist,
96 final String myText,
97 final TokenizingExtractor extractor,
98 final Map<String, List<Float>> myFeatureWeights) {
99 super();
100
101 if (myPredDist.size() < 1) {
102 throw new IllegalArgumentException("Empty prediction distribution");
103 }
104
105 predDist = myPredDist;
106 text = myText;
107 featureExtractor = extractor;
108 featureWeights = myFeatureWeights;
109 }
110
111
112 /***
113 * Returns the predicted class: "spam" or "nonspam".
114 *
115 * @return the predicted class
116 */
117 public String getPredictedClass() {
118 return predDist.best().getType();
119 }
120
121 /***
122 * Returns the probability of the most likely class.
123 *
124 * @return the probability of the predicted class, a number in the in range
125 * from 0.0 to 1.0
126 */
127 public double getProbability() {
128 return predDist.best().getProbability().getProb();
129 }
130
131 /***
132 * Returns the {@linkplain
133 * de.fu_berlin.ties.classify.winnow.Winnow#normalizeScore normalized score}
134 * for the <em>nonspam</em> class. "Good" scores are > 1, "bad" scores
135 * are in the <code>]0, 1[</code> rande and 1.0 means "don't know".
136 *
137 * @return the score for the nonspam class
138 */
139 public float getNonspamScore() {
140 return getScoreForType(SpamFilterDemo.CLASS_NONSPAM);
141 }
142
143 /***
144 * Helper method that returns the {@linkplain
145 * de.fu_berlin.ties.classify.winnow.Winnow#normalizeScore normalized score}
146 * of a predicted type. This will only work for
147 * {@linkplain de.fu_berlin.ties.classify.winnow.Winnow Winnow}-based
148 * classifiers. Otherwise, or if the predicted type does not exist in the
149 * probability distribution, {@linkplain Float#NaN Not-a-Number} is
150 * returned.
151 *
152 * @param type the type to look up
153 * @return the normalized score of the given type, if applicable
154 */
155 private float getScoreForType(final String type) {
156 final float result;
157 Iterator<Prediction> predIter = predDist.iterator();
158 Prediction currentPred;
159 Prediction typedPred = null;
160
161
162 while (predIter.hasNext() && (typedPred == null)) {
163 currentPred = predIter.next();
164 if (currentPred.getType().equals(type)) {
165
166 typedPred = currentPred;
167 }
168 }
169
170 if ((typedPred != null) && (typedPred instanceof WinnowPrediction)) {
171
172 final WinnowPrediction winnowPred = (WinnowPrediction) typedPred;
173 result = winnowPred.getNormalizedScore();
174 } else {
175 result = Float.NaN;
176 }
177
178 return result;
179 }
180
181 /***
182 * Returns the {@linkplain
183 * de.fu_berlin.ties.classify.winnow.Winnow#normalizeScore normalized score}
184 * for the <em>spam</em> class. "Good" scores are > 1, "bad" scores
185 * are in the <code>]0, 1[</code> rande and 1.0 means "don't know".
186 *
187 * @return the score for the spam class
188 */
189 public float getSpamScore() {
190 return getScoreForType(SpamFilterDemo.CLASS_SPAM);
191 }
192
193 /***
194 * Returns a string representation of this object.
195 *
196 * @return a textual representation
197 */
198 public String toString() {
199 return new ToStringBuilder(this)
200 .append("predicted class", getPredictedClass())
201 .append("probability", getProbability())
202 .append("spam score", getSpamScore())
203 .append("nonspam score", getNonspamScore())
204 .toString();
205 }
206
207 /***
208 * Writes attribute name=value pairs that must be inserted into the
209 * opening <code><body ...></code> tag of a HTML file containing the
210 * output of the {@link #writeVizualization(Writer)} method. These attribute
211 * ("onload" and "onkeydown") are necessary for initialization the
212 * JavaScript code used for vizualation.
213 *
214 * @param writer writer to append the HTML code to; neither flushed nor
215 * closed by this method
216 * @throws IOException if an I/O error occurs while writing
217 */
218 public void writeBodyAttribute(final Writer writer) throws IOException {
219
220 writer.write(" onload=\"initHandlers();\"");
221 writer.write(" onkeydown=\"keyPressed(event);\" ");
222 }
223
224 /***
225 * Writes HTML code that must be inserted into the contents of the
226 * <code><head></code> element of a HTML file containing the output
227 * of the {@link #writeVizualization(Writer)} method.
228 *
229 * @param writer writer to append the HTML code to; neither flushed nor
230 * closed by this method
231 * @throws IOException if an I/O error occurs while writing
232 */
233 public void writeHTMLHead(final Writer writer) throws IOException {
234 final ContextMap contextMap = new ContextMap();
235 contextMap.put("featureWeights", featureWeights);
236
237
238 final Iterator<Prediction> predIter = predDist.iterator();
239 WinnowPrediction pred;
240
241 while (predIter.hasNext()) {
242 pred = (WinnowPrediction) predIter.next();
243 contextMap.put(pred.getType() + "ScoreWinnow", pred.getRawScore());
244 }
245
246 try {
247 VelocityService.renderTemplate("demo-head", contextMap, writer);
248 } catch (VelocityException ve) {
249
250 throw new RuntimeException(ve);
251 }
252 }
253
254 /***
255 * Helper method that generated an empty "div" element for the paragraph
256 * showing the feature scores.
257 *
258 * @param writer the writer to write to
259 * @param idSuffix suffix to append to {@link #FEATURE_SCORE_ID} to form the
260 * ID of this element, separated by "-"
261 * @throws IOException if an I/O error occurs while writing
262 */
263 private void writeFeatureScoreElem(final Writer writer,
264 final String idSuffix) throws IOException {
265 writer.write("<div class=\"" + FEATURE_SCORE_ID + "\" id=\""
266 + FEATURE_SCORE_ID + "-" + idSuffix + "\"> </div>");
267 }
268
269
270 /***
271 * Writes a simple but complete HTML file that combines the output of
272 * of the {@link #writeHTMLHead(Writer)},
273 * {@link #writeBodyAttribute(Writer)} and
274 * {@link #writeVizualization(Writer)} methods.
275 *
276 * <p>This method is implemented as followings:
277 * <pre>
278 * writer.write("<html><head>");
279 * writeHTMLHead(writer);
280 * writer.write("</head><body ");
281 * writeBodyAttribute(writer);
282 * writer.write(">");
283 * writeVizualization(writer);
284 * writer.write("</body></html>\n");
285 * writer.flush();
286 * </pre>
287 *
288 * @param writer writer to writer the HTML file to; flushed but not
289 * closed by this method
290 * @throws IOException if an I/O error occurs while writing
291 */
292 public void writeTestHTML(final Writer writer) throws IOException {
293 writer.write("<html><head>");
294 writeHTMLHead(writer);
295 writer.write("</head><body ");
296 writeBodyAttribute(writer);
297 writer.write(">");
298 writeVizualization(writer);
299 writer.write("</body></html>\n");
300 writer.flush();
301 }
302
303 /***
304 * Writes an HTML fragment that contains a vizualization of the classified
305 * mail (showing which features have been most important for classification
306 * etc.). Note that you <b>must</b> insert the output of the
307 * {@link #writeHTMLHead(Writer)} method into the contents of the
308 * <code><head></code> element and the output of the
309 * {@link #writeBodyAttribute(Writer)} method into the opening
310 * <code><body ...></code> tag of the HTML file prior to calling this
311 * method or the Dynamic HTML code generated by this method will not work at
312 * all.
313 *
314 * @param writer writer to append the HTML code to; neither flushed nor
315 * closed by this method
316 * @throws IOException if an I/O error occurs while writing
317 */
318 public void writeVizualization(final Writer writer) throws IOException {
319
320 writer.write("<pre>");
321
322
323 synchronized (featureExtractor) {
324 final TextTokenizer tokenizer = featureExtractor.getTokenizer();
325 tokenizer.reset(text);
326 String token;
327
328 while ((token = tokenizer.nextToken()) != null) {
329 writer.write(tokenizer.precedingWhitespace());
330 writer.write("<tok>");
331 writer.write(StringEscapeUtils.escapeXml(token));
332 writer.write("</tok>");
333 }
334 }
335
336
337 for (int i = 0; i < 10; i++) {
338 writer.write(TextUtils.LINE_SEPARATOR);
339 }
340 writer.write("</pre>");
341
342
343 writer.write(TextUtils.LINE_SEPARATOR);
344 writeFeatureScoreElem(writer, "feature");
345 writeFeatureScoreElem(writer, SpamFilterDemo.CLASS_NONSPAM);
346 writeFeatureScoreElem(writer, SpamFilterDemo.CLASS_SPAM);
347 }
348
349 }