View Javadoc

1   /*
2    * Copyright (C) 2005-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.demo;
23  
24  import java.io.IOException;
25  import java.io.Writer;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  
30  import org.apache.commons.lang.StringEscapeUtils;
31  import org.apache.commons.lang.builder.ToStringBuilder;
32  import org.apache.velocity.exception.VelocityException;
33  
34  import de.fu_berlin.ties.ContextMap;
35  import de.fu_berlin.ties.classify.Prediction;
36  import de.fu_berlin.ties.classify.PredictionDistribution;
37  import de.fu_berlin.ties.classify.winnow.WinnowPrediction;
38  import de.fu_berlin.ties.text.TextTokenizer;
39  import de.fu_berlin.ties.text.TextUtils;
40  import de.fu_berlin.ties.text.TokenizingExtractor;
41  import de.fu_berlin.ties.util.VelocityService;
42  
43  /***
44   * Instances of this class contain detailed filtering results with
45   * additional debugging information. This class supports only the
46   * {@link de.fu_berlin.ties.classify.winnow.Winnow} classifier and subclasses.
47   *
48   * @author Christian Siefkes
49   * @version $Revision: 1.14 $, $Date: 2006/10/21 16:04:09 $, $Author: siefkes $
50   */
51  public class FilterResult {
52  
53      /***
54       * ID (prefix) used for the feature scores table and child elements.
55       */
56      private static final String FEATURE_SCORE_ID = "featureScoreTable";
57  
58  
59      /***
60       * Result returned by the classifier.
61       */
62      private final PredictionDistribution predDist;
63  
64      /***
65       * Used to convert text sequences into feature vectors.
66       * Synchronized on itself.
67       */
68      private final TokenizingExtractor featureExtractor;
69  
70      /***
71       * A mapping from feature representation to weights arrays, as returned by
72       * the {@link de.fu_berlin.ties.classify.winnow.Winnow#showFeatureWeights}
73       * method.
74       */
75      private final Map<String, List<Float>> featureWeights;
76  
77      /***
78       * The filtered text.
79       */
80      private final String text;
81  
82  
83      /***
84       * Creates a new instance.
85       *
86       * @param myPredDist the prediction distribution wrapped by this instance
87       * @param myText the filtered text
88       * @param extractor the feature extractor used to tokenize text sequences
89       * -- will be synchronized on itself
90       * @param myFeatureWeights a mapping from feature representation to weights
91       * arrays, as returned by the
92       * {@link de.fu_berlin.ties.classify.winnow.Winnow#showFeatureWeights}
93       * method
94       */
95      public FilterResult(final PredictionDistribution myPredDist,
96              final String myText,
97              final TokenizingExtractor extractor,
98              final Map<String, List<Float>> myFeatureWeights) {
99          super();
100 
101         if (myPredDist.size() < 1) {
102             throw new IllegalArgumentException("Empty prediction distribution");
103         }
104 
105         predDist = myPredDist;
106         text = myText;
107         featureExtractor = extractor;
108         featureWeights = myFeatureWeights;
109     }
110 
111 
112     /***
113      * Returns the predicted class: "spam" or "nonspam".
114      *
115      * @return the predicted class
116      */
117     public String getPredictedClass() {
118         return predDist.best().getType();
119     }
120 
121     /***
122      * Returns the probability of the most likely class.
123      *
124      * @return the probability of the predicted class, a number in the in range
125      * from 0.0 to 1.0
126      */
127     public double getProbability() {
128         return predDist.best().getProbability().getProb();
129     }
130 
131     /***
132      * Returns the {@linkplain
133      * de.fu_berlin.ties.classify.winnow.Winnow#normalizeScore normalized score}
134      * for the <em>nonspam</em> class. "Good" scores are &gt; 1, "bad" scores
135      * are in the <code>]0, 1[</code> rande and 1.0 means "don't know".
136      *
137      * @return the score for the nonspam class
138      */
139     public float getNonspamScore() {
140         return getScoreForType(SpamFilterDemo.CLASS_NONSPAM);
141     }
142 
143     /***
144      * Helper method that returns the {@linkplain
145      * de.fu_berlin.ties.classify.winnow.Winnow#normalizeScore normalized score}
146      * of a predicted type. This will only work for
147      * {@linkplain de.fu_berlin.ties.classify.winnow.Winnow Winnow}-based
148      * classifiers. Otherwise, or if the predicted type does not exist in the
149      * probability distribution, {@linkplain Float#NaN Not-a-Number} is
150      * returned.
151      *
152      * @param type the type to look up
153      * @return the normalized score of the given type, if applicable
154      */
155     private float getScoreForType(final String type) {
156         final float result;
157         Iterator<Prediction> predIter = predDist.iterator();
158         Prediction currentPred;
159         Prediction typedPred = null;
160 
161         // find matching prediction
162         while (predIter.hasNext() && (typedPred == null)) {
163             currentPred = predIter.next();
164             if (currentPred.getType().equals(type)) {
165                 // got it!
166                 typedPred = currentPred;
167             }
168         }
169 
170         if ((typedPred != null) && (typedPred instanceof WinnowPrediction)) {
171             // got matching prediction and it's of the correct subtype
172             final WinnowPrediction winnowPred = (WinnowPrediction) typedPred;
173             result = winnowPred.getNormalizedScore();
174         } else {
175             result = Float.NaN;
176         }
177 
178         return result;
179     }
180 
181     /***
182      * Returns the {@linkplain
183      * de.fu_berlin.ties.classify.winnow.Winnow#normalizeScore normalized score}
184      * for the <em>spam</em> class. "Good" scores are &gt; 1, "bad" scores
185      * are in the <code>]0, 1[</code> rande and 1.0 means "don't know".
186      *
187      * @return the score for the spam class
188      */
189     public float getSpamScore() {
190         return getScoreForType(SpamFilterDemo.CLASS_SPAM);
191     }
192 
193     /***
194      * Returns a string representation of this object.
195      *
196      * @return a textual representation
197      */
198     public String toString() {
199         return new ToStringBuilder(this)
200             .append("predicted class", getPredictedClass())
201             .append("probability", getProbability())
202             .append("spam score", getSpamScore())
203             .append("nonspam score", getNonspamScore())
204             .toString();
205     }
206 
207     /***
208      * Writes attribute name=value pairs that must be inserted into the
209      * opening <code>&lt;body ...&gt;</code> tag of a HTML file containing the
210      * output of the {@link #writeVizualization(Writer)} method. These attribute
211      * ("onload" and "onkeydown") are necessary for initialization the
212      * JavaScript code used for vizualation.
213      *
214      * @param writer writer to append the HTML code to; neither flushed nor
215      * closed by this method
216      * @throws IOException if an I/O error occurs while writing
217      */
218     public void writeBodyAttribute(final Writer writer) throws IOException {
219         // surround by whitespace just to make sure
220         writer.write(" onload=\"initHandlers();\"");
221         writer.write(" onkeydown=\"keyPressed(event);\" ");
222     }
223 
224     /***
225      * Writes HTML code that must be inserted into the contents of the
226      * <code>&lt;head&gt;</code> element of a HTML file containing the output
227      * of the {@link #writeVizualization(Writer)} method.
228      *
229      * @param writer writer to append the HTML code to; neither flushed nor
230      * closed by this method
231      * @throws IOException if an I/O error occurs while writing
232      */
233     public void writeHTMLHead(final Writer writer) throws IOException {
234         final ContextMap contextMap = new ContextMap();
235         contextMap.put("featureWeights", featureWeights);
236 
237         // store feature weights and scores for all predictions
238         final Iterator<Prediction> predIter = predDist.iterator();
239         WinnowPrediction pred;
240 
241         while (predIter.hasNext()) {
242             pred = (WinnowPrediction) predIter.next();
243             contextMap.put(pred.getType() + "ScoreWinnow", pred.getRawScore());
244         }
245 
246         try {
247             VelocityService.renderTemplate("demo-head", contextMap, writer);
248         } catch (VelocityException ve) {
249             // not supposed to happen -- wrap in uncaught exception
250             throw new RuntimeException(ve);
251         }
252     }
253 
254     /***
255      * Helper method that generated an empty "div" element for the paragraph
256      * showing the feature scores.
257      *
258      * @param writer the writer to write to
259      * @param idSuffix suffix to append to {@link #FEATURE_SCORE_ID} to form the
260      * ID of this element, separated by "-"
261      * @throws IOException if an I/O error occurs while writing
262      */
263     private void writeFeatureScoreElem(final Writer writer,
264             final String idSuffix) throws IOException {
265         writer.write("<div class=\"" + FEATURE_SCORE_ID + "\" id=\""
266                 + FEATURE_SCORE_ID + "-" + idSuffix + "\"> </div>");
267     }
268 
269 
270     /***
271      * Writes a simple but complete HTML file that combines the output of
272      * of the {@link #writeHTMLHead(Writer)},
273      * {@link #writeBodyAttribute(Writer)} and
274      * {@link #writeVizualization(Writer)} methods.
275      *
276      * <p>This method is implemented as followings:
277      * <pre>
278      *   writer.write("&lt;html>&lt;head>");
279      *   writeHTMLHead(writer);
280      *   writer.write("&lt;/head>&lt;body ");
281      *   writeBodyAttribute(writer);
282      *   writer.write(">");
283      *   writeVizualization(writer);
284      *   writer.write("&lt;/body>&lt;/html>\n");
285      *   writer.flush();
286      * </pre>
287      *
288      * @param writer writer to writer the HTML file to; flushed but not
289      * closed by this method
290      * @throws IOException if an I/O error occurs while writing
291      */
292     public void writeTestHTML(final Writer writer) throws IOException {
293         writer.write("<html><head>");
294         writeHTMLHead(writer);
295         writer.write("</head><body ");
296         writeBodyAttribute(writer);
297         writer.write(">");
298         writeVizualization(writer);
299         writer.write("</body></html>\n");
300         writer.flush();
301     }
302 
303     /***
304      * Writes an HTML fragment that contains a vizualization of the classified
305      * mail (showing which features have been most important for classification
306      * etc.). Note that you <b>must</b> insert the output of the
307      * {@link #writeHTMLHead(Writer)} method into the contents of the
308      * <code>&lt;head&gt;</code> element and the output of the
309      * {@link #writeBodyAttribute(Writer)} method into the opening
310      * <code>&lt;body ...&gt;</code> tag of the HTML file prior to calling this
311      * method or the Dynamic HTML code generated by this method will not work at
312      * all.
313      *
314      * @param writer writer to append the HTML code to; neither flushed nor
315      * closed by this method
316      * @throws IOException if an I/O error occurs while writing
317      */
318     public void writeVizualization(final Writer writer) throws IOException {
319         // open preformatted block
320         writer.write("<pre>");
321 
322         // synchronize on the extractor to avoid collisions
323         synchronized (featureExtractor) {
324             final TextTokenizer tokenizer = featureExtractor.getTokenizer();
325             tokenizer.reset(text);
326             String token;
327 
328             while ((token = tokenizer.nextToken()) != null) {
329                 writer.write(tokenizer.precedingWhitespace());
330                 writer.write("<tok>");
331                 writer.write(StringEscapeUtils.escapeXml(token));
332                 writer.write("</tok>");
333             }
334         }
335 
336         // append 10 lines for better scrolling + close block
337         for (int i = 0; i < 10; i++) {
338             writer.write(TextUtils.LINE_SEPARATOR);
339         }
340         writer.write("</pre>");
341 
342         // init elements showing feature scores (populated via JavaScript)
343         writer.write(TextUtils.LINE_SEPARATOR);
344         writeFeatureScoreElem(writer, "feature");
345         writeFeatureScoreElem(writer, SpamFilterDemo.CLASS_NONSPAM);
346         writeFeatureScoreElem(writer, SpamFilterDemo.CLASS_SPAM);
347     }
348 
349 }