View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.filter;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.util.HashMap;
27  import java.util.List;
28  import java.util.Map;
29  
30  import org.apache.commons.lang.builder.ToStringBuilder;
31  import org.dom4j.Document;
32  import org.dom4j.Element;
33  import org.dom4j.NodeFilter;
34  
35  import de.fu_berlin.ties.ProcessingException;
36  import de.fu_berlin.ties.TiesConfiguration;
37  import de.fu_berlin.ties.classify.Reranker;
38  import de.fu_berlin.ties.classify.feature.FeatureVector;
39  import de.fu_berlin.ties.context.LocalFeature;
40  import de.fu_berlin.ties.context.PriorRecognitions;
41  import de.fu_berlin.ties.context.Representation;
42  import de.fu_berlin.ties.extract.Extraction;
43  import de.fu_berlin.ties.text.TokenizerFactory;
44  import de.fu_berlin.ties.xml.dom.DOMUtils;
45  
46  /***
47   * A trainable filter that uses a
48   * {@link de.fu_berlin.ties.context.Representation}to convert elements into
49   * feature vectors.
50   * 
51   * @author Christian Siefkes
52   * @version $Revision: 1.16 $, $Date: 2006/10/21 16:04:20 $, $Author: siefkes $
53   */
54  public class RepresentationFilter extends TrainableFilter {
55  
56      /***
57       * The representation used to convert elements into feature vectors.
58       */
59      private final Representation representation;
60  
61      /***
62       * Feature cache used by the representation.
63       */
64      private final Map<Element, List<LocalFeature>> featureCache =
65          new HashMap<Element, List<LocalFeature>>();
66  
67      /***
68       * The purpose of this filter, e.g. "Sentence" if it is used for sentence
69       * filtering.
70       */
71      private final String purpose;
72  
73      /***
74       * A factory for creating tokenizers.
75       */
76      private final TokenizerFactory tFactory;
77  
78      /***
79       * List of prior recognitions used by the representation.
80       */
81      private PriorRecognitions priorRecogs;
82  
83      /***
84       * Creates a new instance.
85       * 
86       * @param conf used to initialize the trainable classifier by calling {@link
87       * de.fu_berlin.ties.classify.TrainableClassifier#createClassifier(
88       * java.util.Set, TiesConfiguration, String)} with an optional "filter"
89       * suffix
90       * @param positiveFilter this filter is queried to decide whether this class
91       * {@link #prefers(Element) prefers}an element
92       * @param negativeFilter this filter is queried to decide whether this class
93       * {@link #avoids(Element) avoids}an element
94       * @param rerank an optional reranker that recalculates probabilities to
95       * introduce a bias (can be used to favor recall over precision, by setting
96       * a bias &gt; 1 for the "true" class, etc.); might be <code>null</code>
97       * @param rep the representation used to convert elements into feature
98       * vectors
99       * @param filterPurpose the purpose of this filter, e.g. "Sentence" if it is
100      * used for sentence filtering
101      * @throws ProcessingException if the initialization of the trainable
102      * classifier fails
103      */
104     public RepresentationFilter(final TiesConfiguration conf,
105             final NodeFilter positiveFilter, final NodeFilter negativeFilter,
106             final Reranker rerank, final Representation rep,
107             final String filterPurpose) throws ProcessingException {
108         super(conf, positiveFilter, negativeFilter, rerank);
109         representation = rep;
110         purpose = filterPurpose;
111         tFactory = new TokenizerFactory(conf);
112     }
113 
114     /***
115      * {@inheritDoc}
116      */
117     public FeatureVector buildFeatures(final Element element) {
118         // collect textual content of element
119         final StringBuilder mainText = new StringBuilder();
120         DOMUtils.collectText(element, mainText);
121 
122         // don't use any left or right text
123         return representation.buildContext(element, "", mainText.toString(),
124                 "", priorRecogs, featureCache, purpose);
125     }
126 
127     /***
128      * {@inheritDoc}
129      */
130     public void init(final Document document, final File filename)
131     throws ProcessingException, IOException  {
132         // reset feature cache + prior recognitions
133         featureCache.clear();
134         priorRecogs = representation.initDocument(filename, tFactory);
135 
136         // delegate to superclass, just to make sure
137         super.init(document, filename);
138     }
139 
140     /***
141      * {@inheritDoc}
142      */
143     public boolean matches(final Element element) throws ProcessingException {
144         // delegate to super class
145         final boolean result = super.matches(element);
146 
147         // update prior recognitions
148         final Extraction newRecognition;
149 
150         if (result) {
151             // positive result: use element name as type
152             final StringBuilder collectedText = new StringBuilder();
153             DOMUtils.collectText(element, collectedText);
154             newRecognition = new Extraction(DOMUtils.name(element),
155                     collectedText.toString());
156         } else {
157             // negative result: use "<null>" as type and empty string as text
158             newRecognition = new Extraction("<null>", "");
159         }
160 
161         // seal and add recognition
162         newRecognition.setSealed(true);
163         priorRecogs.add(newRecognition);
164 
165         return result;
166     }
167 
168     /***
169      * Returns a string representation of this object.
170      * 
171      * @return a textual representation
172      */
173     public String toString() {
174         return new ToStringBuilder(this)
175                 .appendSuper(super.toString())
176                 .append("representation", representation)
177                 .toString();
178     }
179 
180 }