View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.filter;
23  
24  import org.dom4j.Element;
25  
26  import de.fu_berlin.ties.ProcessingException;
27  import de.fu_berlin.ties.text.TokenizerFactory;
28  import de.fu_berlin.ties.util.Util;
29  import de.fu_berlin.ties.xml.dom.DOMUtils;
30  import de.fu_berlin.ties.xml.dom.TokenProcessor;
31  
32  /***
33   * A filtering token walker that can be trained.
34   *
35   * @author Christian Siefkes
36   * @version $Revision: 1.8 $, $Date: 2004/09/15 15:56:54 $, $Author: siefkes $
37   */
38  public class TrainableFilteringTokenWalker extends FilteringTokenWalker {
39  
40      /***
41       * Training the embedded filter can be disabled by setting this to
42       * <code>false</code>.
43       */
44      private final boolean trainingEnabled;
45  
46      /***
47       * Oracle queried to decide which elements should be accepted by the
48       * trainable filter.
49       */
50      private final Oracle oracle;
51  
52  
53      /***
54       * Creates a new instance, enabling training the filter.
55       *
56       * @param processor used to process the tokens
57       * @param tFactory used to instantiate tokenizers
58       * @param elementFilter the trainable element filter to use
59       * @param sHandler a handler that is called whenever some tokens are
60       * skipped; may be <code>null</code>
61       * @param elementOracle oracle queried to decide which elements should be
62       * accepted by the trainable filter
63       */
64      public TrainableFilteringTokenWalker(final TokenProcessor processor,
65                                           final TokenizerFactory tFactory,
66                                           final TrainableFilter elementFilter,
67                                           final SkipHandler sHandler,
68                                           final Oracle elementOracle) {
69          this(processor, tFactory, elementFilter, sHandler, elementOracle, true);
70      }
71  
72      /***
73       * Creates a new instance.
74       *
75       * @param processor used to process the tokens
76       * @param tFactory used to instantiate tokenizers
77       * @param elementFilter the trainable element filter to use
78       * @param sHandler a handler that is called whenever some tokens are
79       * skipped; may be <code>null</code>
80       * @param elementOracle oracle queried to decide which elements should be
81       * accepted by the trainable filter
82       * @param enableTraining if <code>true</code> the embedded filter is
83       * trained from the decisions of the oracle; otherwise the oracle is only
84       * queried to log if the filter made a mistake
85       */
86      public TrainableFilteringTokenWalker(final TokenProcessor processor,
87                                           final TokenizerFactory tFactory,
88                                           final TrainableFilter elementFilter,
89                                           final SkipHandler sHandler,
90                                           final Oracle elementOracle,
91                                           final boolean enableTraining) {
92          super(processor, tFactory, elementFilter, sHandler);
93          oracle = elementOracle;
94          trainingEnabled = enableTraining;
95      }
96  
97  
98      /***
99       * {@inheritDoc} This implementation relies on the oracle to make the final
100      * decision and joins the predicted <code>decision</code> and the correct
101      * decision via OR. This allows the next step to view the tokenized text
102      * in all necessary cases (if it should view it as determined by the oracle,
103      * or it is would view it because of the the trainable classifer's
104      * prediction). It also gives the trainable filter to chance to train itself
105      * on the correct decision -- even if the original decision was already
106      * correct since there are classifiers (e.g. Winnow) that are not purely
107      * error-driven but also learn from reinforcement of (some) correct
108      * instances.
109      */
110     protected boolean handleAccept(final Element element,
111             final Element filteredElement,  final boolean decision)
112     throws ProcessingException {
113         final boolean correctDecision = oracle.shouldMatch(element);
114 
115         // check prediction if the filter was asked
116         if (filteredElement != null) {
117             // log if mistake
118             if (decision != correctDecision) {
119                 Util.LOG.debug("Trainable filter predicted " + decision
120                         + " instead of " + correctDecision + " for "
121                         + DOMUtils.showElement(filteredElement));
122             }
123 
124             if (trainingEnabled) {
125                 // give trainable filter a chance to train itself (we know it is
126                 // trainable because of our constructor)
127                 final TrainableFilter filter = (TrainableFilter) getFilter();
128                 filter.trainIfNecessary(filteredElement, correctDecision);
129             }
130         }
131 
132         // allow the next step to view the text in all necessary cases
133         return decision || correctDecision;
134     }
135 
136     /***
137      * Returns <code>true</code> if training the embedded filter is enabled
138      * (default).
139      *
140      * @return whether training is enabled
141      */
142     public boolean isTrainingEnabled() {
143         return trainingEnabled;
144     }
145 
146 }