View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.filter;
23  
24  import java.util.Collections;
25  import java.util.SortedSet;
26  import java.util.TreeSet;
27  
28  import org.apache.commons.lang.builder.ToStringBuilder;
29  import org.dom4j.Document;
30  import org.dom4j.Element;
31  import org.dom4j.NodeFilter;
32  
33  import de.fu_berlin.ties.ProcessingException;
34  import de.fu_berlin.ties.TiesConfiguration;
35  import de.fu_berlin.ties.classify.PredictionDistribution;
36  import de.fu_berlin.ties.classify.Reranker;
37  import de.fu_berlin.ties.classify.TrainableClassifier;
38  import de.fu_berlin.ties.classify.feature.FeatureVector;
39  import de.fu_berlin.ties.util.Util;
40  
41  /***
42   * An abstract filter that uses a
43   * {@link de.fu_berlin.ties.classify.TrainableClassifier trainable classifier}
44   * for training. Subclasses must implement the {@link #buildFeatures(Element)}
45   * method to convert elements into feature vectors suitable for the classifier.
46   * 
47   * <p>
48   * Instances of this class are not thread-safe and must be synchronized
49   * externally, if required.
50   * 
51   * @author Christian Siefkes
52   * @version $Revision: 1.8 $, $Date: 2004/12/09 18:09:46 $, $Author: siefkes $
53   */
54  public abstract class TrainableFilter implements ElementFilter {
55  
56      /***
57       * An immutable set of the target classes for the classifier: the strings
58       * "false" and "true", in alphabetic order.
59       */
60      public static final SortedSet<String> BOOLEAN_CLASSES;
61  
62      /***
63       * Initialize target classes.
64       */
65      static {
66          final SortedSet<String> classes = new TreeSet<String>();
67          classes.add(Boolean.FALSE.toString());
68          classes.add(Boolean.TRUE.toString());
69          BOOLEAN_CLASSES = Collections.unmodifiableSortedSet(classes);
70      }
71  
72      /***
73       * The trainable classifier used by this instance.
74       */
75      private final TrainableClassifier classifier;
76  
77      /***
78       * The last element that was classified by this instance, if any.
79       */
80      private Element lastElement;
81  
82      /***
83       * The feature vector representing the {@link #lastElement last element
84       * classified by this instance}, if any.
85       */
86      private FeatureVector lastFeatures;
87  
88      /***
89       * This filter is queried to decide whether this class would like to
90       * {@link #avoids(Element) avoid}an element.
91       */
92      private NodeFilter avoidFilter;
93  
94      /***
95       * This filter is queried to decide whether this class
96       * {@link #prefers(Element) prefers}an element.
97       */
98      private NodeFilter preferredFilter;
99  
100     /***
101      * An optional reranker that recalculates probabilities to introduce a bias.
102      * This can be used to favor recall over precision (by setting a bias &gt;
103      * 1 for the "true" class) etc.
104      */
105     private final Reranker reranker;
106 
107     /***
108      * Creates a new instance.
109      * 
110      * @param conf used to initialize the trainable classifier by calling
111      * {@link TrainableClassifier#createClassifier(java.util.Set,
112      * TiesConfiguration)}
113      * @param positiveFilter this filter is queried to decide whether this class
114      * {@link #prefers(Element) prefers}an element
115      * @param negativeFilter this filter is queried to decide whether this class
116      * {@link #avoids(Element) avoids}an element
117      * @param rerank an optional reranker that recalculates probabilities to
118      * introduce a bias (can be used to favor recall over precision, by setting
119      * a bias &gt; 1 for the "true" class, etc.); might be <code>null</code>
120      * @throws ProcessingException if the initialization of the trainable
121      * classifier fails
122      */
123     public TrainableFilter(final TiesConfiguration conf,
124             final NodeFilter positiveFilter, final NodeFilter negativeFilter,
125             final Reranker rerank) throws ProcessingException {
126         super();
127         classifier =
128             TrainableClassifier.createClassifier(BOOLEAN_CLASSES, conf);
129         preferredFilter = positiveFilter;
130         avoidFilter = negativeFilter;
131         reranker = rerank;
132     }
133 
134     /***
135      * Converts an element into a feature vector to be used by the trainable
136      * classifier.
137      * 
138      * @param element the element to convert
139      * @return a feature vector representing the provided element
140      */
141     public abstract FeatureVector buildFeatures(final Element element);
142 
143     /***
144      * {@inheritDoc}
145      */
146     public void init(final Document document) {
147         // do nothing -- subclasses can add their own behavior
148     }
149 
150     /***
151      * {@inheritDoc}
152      */
153     public boolean avoids(final Element element) {
154         return avoidFilter.matches(element);
155     }
156 
157     /***
158      * {@inheritDoc}
159      */
160     public boolean matches(final Element element) throws ProcessingException {
161         final FeatureVector features = buildFeatures(element);
162 
163         // cache element + features
164         lastElement = element;
165         lastFeatures = features;
166 
167         // invoke classifier + interpret result
168         final PredictionDistribution origDist = classifier.classify(features,
169                 BOOLEAN_CLASSES);
170 
171         // rerank results, if supported
172         final PredictionDistribution finalDist;
173         if (reranker != null) {
174             finalDist = reranker.rerank(origDist);
175         } else {
176             finalDist = origDist;
177         }
178 
179         final String bestClass = finalDist.best().getType();
180         return Util.asBoolean(bestClass);
181     }
182 
183     /***
184      * {@inheritDoc}
185      */
186     public boolean prefers(final Element element) {
187         return preferredFilter.matches(element);
188     }
189 
190     /***
191      * Returns a string representation of this object.
192      * 
193      * @return a textual representation
194      */
195     public String toString() {
196         return new ToStringBuilder(this)
197                 .append("classifier", classifier)
198                 .append("preferred filter", preferredFilter)
199                 .append("reranker", reranker)
200                 .toString();
201     }
202 
203     /***
204      * Trains the correct decision for an element by calling the
205      * {@link TrainableClassifier#trainOnError(FeatureVector, String,
206      * java.util.Set)} method on the stored trainable classifier. By using this
207      * method, the classifier will train itself only when necessary.
208      * 
209      * @param element the element to train
210      * @param decision the correct decision for the element -- whether or not it
211      * should be accepted
212      * @return the prediction distribution returned by {@link
213      * TrainableClassifier#trainOnError(FeatureVector, String, java.util.Set)}
214      * @throws ProcessingException if an error occurs during training
215      */
216     public PredictionDistribution trainIfNecessary(final Element element,
217             final boolean decision) throws ProcessingException {
218         final FeatureVector features;
219 
220         if (element.equals(lastElement)) {
221             // we've seen this element before -- re-use features
222             features = lastFeatures;
223         } else {
224             // build features for element
225             features = buildFeatures(element);
226         }
227 
228         // delegate to classifier and return result
229         return classifier.trainOnError(features, Boolean.toString(decision),
230                 BOOLEAN_CLASSES);
231     }
232 
233 }