View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.filter;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.util.Collections;
27  import java.util.SortedSet;
28  import java.util.TreeSet;
29  
30  import org.apache.commons.lang.builder.ToStringBuilder;
31  import org.dom4j.Document;
32  import org.dom4j.Element;
33  import org.dom4j.NodeFilter;
34  
35  import de.fu_berlin.ties.ProcessingException;
36  import de.fu_berlin.ties.TiesConfiguration;
37  import de.fu_berlin.ties.classify.PredictionDistribution;
38  import de.fu_berlin.ties.classify.Reranker;
39  import de.fu_berlin.ties.classify.TrainableClassifier;
40  import de.fu_berlin.ties.classify.feature.FeatureVector;
41  import de.fu_berlin.ties.util.Util;
42  
43  /***
44   * An abstract filter that uses a
45   * {@link de.fu_berlin.ties.classify.TrainableClassifier trainable classifier}
46   * for training. Subclasses must implement the {@link #buildFeatures(Element)}
47   * method to convert elements into feature vectors suitable for the classifier.
48   * 
49   * <p>
50   * Instances of this class are not thread-safe and must be synchronized
51   * externally, if required.
52   * 
53   * @author Christian Siefkes
54   * @version $Revision: 1.13 $, $Date: 2006/10/21 16:04:20 $, $Author: siefkes $
55   */
56  public abstract class TrainableFilter implements ElementFilter {
57  
58      /***
59       * An immutable set of the target classes for the classifier: the strings
60       * "false" and "true", in alphabetic order.
61       */
62      public static final SortedSet<String> BOOLEAN_CLASSES;
63  
64      /***
65       * Initialize target classes.
66       */
67      static {
68          final SortedSet<String> classes = new TreeSet<String>();
69          classes.add(Boolean.FALSE.toString());
70          classes.add(Boolean.TRUE.toString());
71          BOOLEAN_CLASSES = Collections.unmodifiableSortedSet(classes);
72      }
73  
74      /***
75       * The trainable classifier used by this instance.
76       */
77      private final TrainableClassifier classifier;
78  
79      /***
80       * The last element that was classified by this instance, if any.
81       */
82      private Element lastElement;
83  
84      /***
85       * The feature vector representing the {@link #lastElement last element
86       * classified by this instance}, if any.
87       */
88      private FeatureVector lastFeatures;
89  
90      /***
91       * This filter is queried to decide whether this class would like to
92       * {@link #avoids(Element) avoid}an element.
93       */
94      private NodeFilter avoidFilter;
95  
96      /***
97       * This filter is queried to decide whether this class
98       * {@link #prefers(Element) prefers}an element.
99       */
100     private NodeFilter preferredFilter;
101 
102     /***
103      * An optional reranker that recalculates probabilities to introduce a bias.
104      * This can be used to favor recall over precision (by setting a bias &gt;
105      * 1 for the "true" class) etc.
106      */
107     private final Reranker reranker;
108 
109     /***
110      * Creates a new instance.
111      * 
112      * @param conf used to initialize the trainable classifier by calling
113      * {@link TrainableClassifier#createClassifier(java.util.Set,
114      * TiesConfiguration, String)} with an optional "filter" suffix
115      * @param positiveFilter this filter is queried to decide whether this class
116      * {@link #prefers(Element) prefers}an element
117      * @param negativeFilter this filter is queried to decide whether this class
118      * {@link #avoids(Element) avoids}an element
119      * @param rerank an optional reranker that recalculates probabilities to
120      * introduce a bias (can be used to favor recall over precision, by setting
121      * a bias &gt; 1 for the "true" class, etc.); might be <code>null</code>
122      * @throws ProcessingException if the initialization of the trainable
123      * classifier fails
124      */
125     public TrainableFilter(final TiesConfiguration conf,
126             final NodeFilter positiveFilter, final NodeFilter negativeFilter,
127             final Reranker rerank) throws ProcessingException {
128         super();
129         // "filter" suffix can be used load a special classifier for filters
130         classifier = TrainableClassifier.createClassifier(BOOLEAN_CLASSES,
131                 conf, "filter");
132         preferredFilter = positiveFilter;
133         avoidFilter = negativeFilter;
134         reranker = rerank;
135     }
136 
137     /***
138      * Converts an element into a feature vector to be used by the trainable
139      * classifier.
140      * 
141      * @param element the element to convert
142      * @return a feature vector representing the provided element
143      */
144     public abstract FeatureVector buildFeatures(final Element element);
145 
146     /***
147      * {@inheritDoc}
148      */
149     public void init(final Document document, final File filename)
150     throws ProcessingException, IOException {
151         // do nothing -- subclasses can add their own behavior
152     }
153 
154     /***
155      * {@inheritDoc}
156      */
157     public boolean avoids(final Element element) {
158         return avoidFilter.matches(element);
159     }
160 
161     /***
162      * {@inheritDoc}
163      */
164     public boolean matches(final Element element) throws ProcessingException {
165         final FeatureVector features = buildFeatures(element);
166 
167         // cache element + features
168         lastElement = element;
169         lastFeatures = features;
170 
171         // invoke classifier + interpret result
172         final PredictionDistribution origDist = classifier.classify(features,
173                 BOOLEAN_CLASSES);
174 
175         // rerank results, if supported
176         final PredictionDistribution finalDist;
177         if (reranker != null) {
178             finalDist = reranker.rerank(origDist);
179         } else {
180             finalDist = origDist;
181         }
182 
183         final String bestClass = finalDist.best().getType();
184         return Util.asBoolean(bestClass);
185     }
186 
187     /***
188      * {@inheritDoc}
189      */
190     public boolean prefers(final Element element) {
191         return preferredFilter.matches(element);
192     }
193 
194     /***
195      * Returns a string representation of this object.
196      * 
197      * @return a textual representation
198      */
199     public String toString() {
200         return new ToStringBuilder(this)
201                 .append("classifier", classifier)
202                 .append("preferred filter", preferredFilter)
203                 .append("reranker", reranker)
204                 .toString();
205     }
206 
207     /***
208      * Trains the correct decision for an element by calling the
209      * {@link TrainableClassifier#trainOnError(FeatureVector, String,
210      * java.util.Set)} method on the stored trainable classifier. By using this
211      * method, the classifier will train itself only when necessary.
212      * 
213      * @param element the element to train
214      * @param decision the correct decision for the element -- whether or not it
215      * should be accepted
216      * @return the prediction distribution returned by {@link
217      * TrainableClassifier#trainOnError(FeatureVector, String, java.util.Set)}
218      * @throws ProcessingException if an error occurs during training
219      */
220     public PredictionDistribution trainIfNecessary(final Element element,
221             final boolean decision) throws ProcessingException {
222         final FeatureVector features;
223 
224         if (element.equals(lastElement)) {
225             // we've seen this element before -- re-use features
226             features = lastFeatures;
227         } else {
228             // build features for element
229             features = buildFeatures(element);
230         }
231 
232         // delegate to classifier and return result
233         return classifier.trainOnError(features, Boolean.toString(decision),
234                 BOOLEAN_CLASSES);
235     }
236 
237 }