1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.filter;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.util.Collections;
27 import java.util.SortedSet;
28 import java.util.TreeSet;
29
30 import org.apache.commons.lang.builder.ToStringBuilder;
31 import org.dom4j.Document;
32 import org.dom4j.Element;
33 import org.dom4j.NodeFilter;
34
35 import de.fu_berlin.ties.ProcessingException;
36 import de.fu_berlin.ties.TiesConfiguration;
37 import de.fu_berlin.ties.classify.PredictionDistribution;
38 import de.fu_berlin.ties.classify.Reranker;
39 import de.fu_berlin.ties.classify.TrainableClassifier;
40 import de.fu_berlin.ties.classify.feature.FeatureVector;
41 import de.fu_berlin.ties.util.Util;
42
43 /***
44 * An abstract filter that uses a
45 * {@link de.fu_berlin.ties.classify.TrainableClassifier trainable classifier}
46 * for training. Subclasses must implement the {@link #buildFeatures(Element)}
47 * method to convert elements into feature vectors suitable for the classifier.
48 *
49 * <p>
50 * Instances of this class are not thread-safe and must be synchronized
51 * externally, if required.
52 *
53 * @author Christian Siefkes
54 * @version $Revision: 1.13 $, $Date: 2006/10/21 16:04:20 $, $Author: siefkes $
55 */
56 public abstract class TrainableFilter implements ElementFilter {
57
58 /***
59 * An immutable set of the target classes for the classifier: the strings
60 * "false" and "true", in alphabetic order.
61 */
62 public static final SortedSet<String> BOOLEAN_CLASSES;
63
64 /***
65 * Initialize target classes.
66 */
67 static {
68 final SortedSet<String> classes = new TreeSet<String>();
69 classes.add(Boolean.FALSE.toString());
70 classes.add(Boolean.TRUE.toString());
71 BOOLEAN_CLASSES = Collections.unmodifiableSortedSet(classes);
72 }
73
74 /***
75 * The trainable classifier used by this instance.
76 */
77 private final TrainableClassifier classifier;
78
79 /***
80 * The last element that was classified by this instance, if any.
81 */
82 private Element lastElement;
83
84 /***
85 * The feature vector representing the {@link #lastElement last element
86 * classified by this instance}, if any.
87 */
88 private FeatureVector lastFeatures;
89
90 /***
91 * This filter is queried to decide whether this class would like to
92 * {@link #avoids(Element) avoid}an element.
93 */
94 private NodeFilter avoidFilter;
95
96 /***
97 * This filter is queried to decide whether this class
98 * {@link #prefers(Element) prefers}an element.
99 */
100 private NodeFilter preferredFilter;
101
102 /***
103 * An optional reranker that recalculates probabilities to introduce a bias.
104 * This can be used to favor recall over precision (by setting a bias >
105 * 1 for the "true" class) etc.
106 */
107 private final Reranker reranker;
108
109 /***
110 * Creates a new instance.
111 *
112 * @param conf used to initialize the trainable classifier by calling
113 * {@link TrainableClassifier#createClassifier(java.util.Set,
114 * TiesConfiguration, String)} with an optional "filter" suffix
115 * @param positiveFilter this filter is queried to decide whether this class
116 * {@link #prefers(Element) prefers}an element
117 * @param negativeFilter this filter is queried to decide whether this class
118 * {@link #avoids(Element) avoids}an element
119 * @param rerank an optional reranker that recalculates probabilities to
120 * introduce a bias (can be used to favor recall over precision, by setting
121 * a bias > 1 for the "true" class, etc.); might be <code>null</code>
122 * @throws ProcessingException if the initialization of the trainable
123 * classifier fails
124 */
125 public TrainableFilter(final TiesConfiguration conf,
126 final NodeFilter positiveFilter, final NodeFilter negativeFilter,
127 final Reranker rerank) throws ProcessingException {
128 super();
129
130 classifier = TrainableClassifier.createClassifier(BOOLEAN_CLASSES,
131 conf, "filter");
132 preferredFilter = positiveFilter;
133 avoidFilter = negativeFilter;
134 reranker = rerank;
135 }
136
137 /***
138 * Converts an element into a feature vector to be used by the trainable
139 * classifier.
140 *
141 * @param element the element to convert
142 * @return a feature vector representing the provided element
143 */
144 public abstract FeatureVector buildFeatures(final Element element);
145
146 /***
147 * {@inheritDoc}
148 */
149 public void init(final Document document, final File filename)
150 throws ProcessingException, IOException {
151
152 }
153
154 /***
155 * {@inheritDoc}
156 */
157 public boolean avoids(final Element element) {
158 return avoidFilter.matches(element);
159 }
160
161 /***
162 * {@inheritDoc}
163 */
164 public boolean matches(final Element element) throws ProcessingException {
165 final FeatureVector features = buildFeatures(element);
166
167
168 lastElement = element;
169 lastFeatures = features;
170
171
172 final PredictionDistribution origDist = classifier.classify(features,
173 BOOLEAN_CLASSES);
174
175
176 final PredictionDistribution finalDist;
177 if (reranker != null) {
178 finalDist = reranker.rerank(origDist);
179 } else {
180 finalDist = origDist;
181 }
182
183 final String bestClass = finalDist.best().getType();
184 return Util.asBoolean(bestClass);
185 }
186
187 /***
188 * {@inheritDoc}
189 */
190 public boolean prefers(final Element element) {
191 return preferredFilter.matches(element);
192 }
193
194 /***
195 * Returns a string representation of this object.
196 *
197 * @return a textual representation
198 */
199 public String toString() {
200 return new ToStringBuilder(this)
201 .append("classifier", classifier)
202 .append("preferred filter", preferredFilter)
203 .append("reranker", reranker)
204 .toString();
205 }
206
207 /***
208 * Trains the correct decision for an element by calling the
209 * {@link TrainableClassifier#trainOnError(FeatureVector, String,
210 * java.util.Set)} method on the stored trainable classifier. By using this
211 * method, the classifier will train itself only when necessary.
212 *
213 * @param element the element to train
214 * @param decision the correct decision for the element -- whether or not it
215 * should be accepted
216 * @return the prediction distribution returned by {@link
217 * TrainableClassifier#trainOnError(FeatureVector, String, java.util.Set)}
218 * @throws ProcessingException if an error occurs during training
219 */
220 public PredictionDistribution trainIfNecessary(final Element element,
221 final boolean decision) throws ProcessingException {
222 final FeatureVector features;
223
224 if (element.equals(lastElement)) {
225
226 features = lastFeatures;
227 } else {
228
229 features = buildFeatures(element);
230 }
231
232
233 return classifier.trainOnError(features, Boolean.toString(decision),
234 BOOLEAN_CLASSES);
235 }
236
237 }