1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.filter;
23
24 import org.dom4j.Element;
25
26 import de.fu_berlin.ties.ProcessingException;
27 import de.fu_berlin.ties.text.TokenizerFactory;
28 import de.fu_berlin.ties.util.Util;
29 import de.fu_berlin.ties.xml.dom.DOMUtils;
30 import de.fu_berlin.ties.xml.dom.TokenProcessor;
31
32 /***
33 * A filtering token walker that can be trained.
34 *
35 * @author Christian Siefkes
36 * @version $Revision: 1.8 $, $Date: 2004/09/15 15:56:54 $, $Author: siefkes $
37 */
38 public class TrainableFilteringTokenWalker extends FilteringTokenWalker {
39
40 /***
41 * Training the embedded filter can be disabled by setting this to
42 * <code>false</code>.
43 */
44 private final boolean trainingEnabled;
45
46 /***
47 * Oracle queried to decide which elements should be accepted by the
48 * trainable filter.
49 */
50 private final Oracle oracle;
51
52
53 /***
54 * Creates a new instance, enabling training the filter.
55 *
56 * @param processor used to process the tokens
57 * @param tFactory used to instantiate tokenizers
58 * @param elementFilter the trainable element filter to use
59 * @param sHandler a handler that is called whenever some tokens are
60 * skipped; may be <code>null</code>
61 * @param elementOracle oracle queried to decide which elements should be
62 * accepted by the trainable filter
63 */
64 public TrainableFilteringTokenWalker(final TokenProcessor processor,
65 final TokenizerFactory tFactory,
66 final TrainableFilter elementFilter,
67 final SkipHandler sHandler,
68 final Oracle elementOracle) {
69 this(processor, tFactory, elementFilter, sHandler, elementOracle, true);
70 }
71
72 /***
73 * Creates a new instance.
74 *
75 * @param processor used to process the tokens
76 * @param tFactory used to instantiate tokenizers
77 * @param elementFilter the trainable element filter to use
78 * @param sHandler a handler that is called whenever some tokens are
79 * skipped; may be <code>null</code>
80 * @param elementOracle oracle queried to decide which elements should be
81 * accepted by the trainable filter
82 * @param enableTraining if <code>true</code> the embedded filter is
83 * trained from the decisions of the oracle; otherwise the oracle is only
84 * queried to log if the filter made a mistake
85 */
86 public TrainableFilteringTokenWalker(final TokenProcessor processor,
87 final TokenizerFactory tFactory,
88 final TrainableFilter elementFilter,
89 final SkipHandler sHandler,
90 final Oracle elementOracle,
91 final boolean enableTraining) {
92 super(processor, tFactory, elementFilter, sHandler);
93 oracle = elementOracle;
94 trainingEnabled = enableTraining;
95 }
96
97
98 /***
99 * {@inheritDoc} This implementation relies on the oracle to make the final
100 * decision and joins the predicted <code>decision</code> and the correct
101 * decision via OR. This allows the next step to view the tokenized text
102 * in all necessary cases (if it should view it as determined by the oracle,
103 * or it is would view it because of the the trainable classifer's
104 * prediction). It also gives the trainable filter to chance to train itself
105 * on the correct decision -- even if the original decision was already
106 * correct since there are classifiers (e.g. Winnow) that are not purely
107 * error-driven but also learn from reinforcement of (some) correct
108 * instances.
109 */
110 protected boolean handleAccept(final Element element,
111 final Element filteredElement, final boolean decision)
112 throws ProcessingException {
113 final boolean correctDecision = oracle.shouldMatch(element);
114
115
116 if (filteredElement != null) {
117
118 if (decision != correctDecision) {
119 Util.LOG.debug("Trainable filter predicted " + decision
120 + " instead of " + correctDecision + " for "
121 + DOMUtils.showElement(filteredElement));
122 }
123
124 if (trainingEnabled) {
125
126
127 final TrainableFilter filter = (TrainableFilter) getFilter();
128 filter.trainIfNecessary(filteredElement, correctDecision);
129 }
130 }
131
132
133 return decision || correctDecision;
134 }
135
136 /***
137 * Returns <code>true</code> if training the embedded filter is enabled
138 * (default).
139 *
140 * @return whether training is enabled
141 */
142 public boolean isTrainingEnabled() {
143 return trainingEnabled;
144 }
145
146 }