1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.filter;
23
24 import java.util.Collections;
25 import java.util.SortedSet;
26 import java.util.TreeSet;
27
28 import org.apache.commons.lang.builder.ToStringBuilder;
29 import org.dom4j.Document;
30 import org.dom4j.Element;
31 import org.dom4j.NodeFilter;
32
33 import de.fu_berlin.ties.ProcessingException;
34 import de.fu_berlin.ties.TiesConfiguration;
35 import de.fu_berlin.ties.classify.PredictionDistribution;
36 import de.fu_berlin.ties.classify.Reranker;
37 import de.fu_berlin.ties.classify.TrainableClassifier;
38 import de.fu_berlin.ties.classify.feature.FeatureVector;
39 import de.fu_berlin.ties.util.Util;
40
41 /***
42 * An abstract filter that uses a
43 * {@link de.fu_berlin.ties.classify.TrainableClassifier trainable classifier}
44 * for training. Subclasses must implement the {@link #buildFeatures(Element)}
45 * method to convert elements into feature vectors suitable for the classifier.
46 *
47 * <p>
48 * Instances of this class are not thread-safe and must be synchronized
49 * externally, if required.
50 *
51 * @author Christian Siefkes
52 * @version $Revision: 1.8 $, $Date: 2004/12/09 18:09:46 $, $Author: siefkes $
53 */
54 public abstract class TrainableFilter implements ElementFilter {
55
56 /***
57 * An immutable set of the target classes for the classifier: the strings
58 * "false" and "true", in alphabetic order.
59 */
60 public static final SortedSet<String> BOOLEAN_CLASSES;
61
62 /***
63 * Initialize target classes.
64 */
65 static {
66 final SortedSet<String> classes = new TreeSet<String>();
67 classes.add(Boolean.FALSE.toString());
68 classes.add(Boolean.TRUE.toString());
69 BOOLEAN_CLASSES = Collections.unmodifiableSortedSet(classes);
70 }
71
72 /***
73 * The trainable classifier used by this instance.
74 */
75 private final TrainableClassifier classifier;
76
77 /***
78 * The last element that was classified by this instance, if any.
79 */
80 private Element lastElement;
81
82 /***
83 * The feature vector representing the {@link #lastElement last element
84 * classified by this instance}, if any.
85 */
86 private FeatureVector lastFeatures;
87
88 /***
89 * This filter is queried to decide whether this class would like to
90 * {@link #avoids(Element) avoid}an element.
91 */
92 private NodeFilter avoidFilter;
93
94 /***
95 * This filter is queried to decide whether this class
96 * {@link #prefers(Element) prefers}an element.
97 */
98 private NodeFilter preferredFilter;
99
100 /***
101 * An optional reranker that recalculates probabilities to introduce a bias.
102 * This can be used to favor recall over precision (by setting a bias >
103 * 1 for the "true" class) etc.
104 */
105 private final Reranker reranker;
106
107 /***
108 * Creates a new instance.
109 *
110 * @param conf used to initialize the trainable classifier by calling
111 * {@link TrainableClassifier#createClassifier(java.util.Set,
112 * TiesConfiguration)}
113 * @param positiveFilter this filter is queried to decide whether this class
114 * {@link #prefers(Element) prefers}an element
115 * @param negativeFilter this filter is queried to decide whether this class
116 * {@link #avoids(Element) avoids}an element
117 * @param rerank an optional reranker that recalculates probabilities to
118 * introduce a bias (can be used to favor recall over precision, by setting
119 * a bias > 1 for the "true" class, etc.); might be <code>null</code>
120 * @throws ProcessingException if the initialization of the trainable
121 * classifier fails
122 */
123 public TrainableFilter(final TiesConfiguration conf,
124 final NodeFilter positiveFilter, final NodeFilter negativeFilter,
125 final Reranker rerank) throws ProcessingException {
126 super();
127 classifier =
128 TrainableClassifier.createClassifier(BOOLEAN_CLASSES, conf);
129 preferredFilter = positiveFilter;
130 avoidFilter = negativeFilter;
131 reranker = rerank;
132 }
133
134 /***
135 * Converts an element into a feature vector to be used by the trainable
136 * classifier.
137 *
138 * @param element the element to convert
139 * @return a feature vector representing the provided element
140 */
141 public abstract FeatureVector buildFeatures(final Element element);
142
143 /***
144 * {@inheritDoc}
145 */
146 public void init(final Document document) {
147
148 }
149
150 /***
151 * {@inheritDoc}
152 */
153 public boolean avoids(final Element element) {
154 return avoidFilter.matches(element);
155 }
156
157 /***
158 * {@inheritDoc}
159 */
160 public boolean matches(final Element element) throws ProcessingException {
161 final FeatureVector features = buildFeatures(element);
162
163
164 lastElement = element;
165 lastFeatures = features;
166
167
168 final PredictionDistribution origDist = classifier.classify(features,
169 BOOLEAN_CLASSES);
170
171
172 final PredictionDistribution finalDist;
173 if (reranker != null) {
174 finalDist = reranker.rerank(origDist);
175 } else {
176 finalDist = origDist;
177 }
178
179 final String bestClass = finalDist.best().getType();
180 return Util.asBoolean(bestClass);
181 }
182
183 /***
184 * {@inheritDoc}
185 */
186 public boolean prefers(final Element element) {
187 return preferredFilter.matches(element);
188 }
189
190 /***
191 * Returns a string representation of this object.
192 *
193 * @return a textual representation
194 */
195 public String toString() {
196 return new ToStringBuilder(this)
197 .append("classifier", classifier)
198 .append("preferred filter", preferredFilter)
199 .append("reranker", reranker)
200 .toString();
201 }
202
203 /***
204 * Trains the correct decision for an element by calling the
205 * {@link TrainableClassifier#trainOnError(FeatureVector, String,
206 * java.util.Set)} method on the stored trainable classifier. By using this
207 * method, the classifier will train itself only when necessary.
208 *
209 * @param element the element to train
210 * @param decision the correct decision for the element -- whether or not it
211 * should be accepted
212 * @return the prediction distribution returned by {@link
213 * TrainableClassifier#trainOnError(FeatureVector, String, java.util.Set)}
214 * @throws ProcessingException if an error occurs during training
215 */
216 public PredictionDistribution trainIfNecessary(final Element element,
217 final boolean decision) throws ProcessingException {
218 final FeatureVector features;
219
220 if (element.equals(lastElement)) {
221
222 features = lastFeatures;
223 } else {
224
225 features = buildFeatures(element);
226 }
227
228
229 return classifier.trainOnError(features, Boolean.toString(decision),
230 BOOLEAN_CLASSES);
231 }
232
233 }