View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.filter;
23  
24  import java.util.HashMap;
25  import java.util.List;
26  import java.util.Map;
27  
28  import org.apache.commons.lang.builder.ToStringBuilder;
29  import org.dom4j.Document;
30  import org.dom4j.Element;
31  import org.dom4j.NodeFilter;
32  
33  import de.fu_berlin.ties.ProcessingException;
34  import de.fu_berlin.ties.TiesConfiguration;
35  import de.fu_berlin.ties.classify.Reranker;
36  import de.fu_berlin.ties.classify.feature.FeatureVector;
37  import de.fu_berlin.ties.context.LocalFeature;
38  import de.fu_berlin.ties.context.PriorRecognitions;
39  import de.fu_berlin.ties.context.Representation;
40  import de.fu_berlin.ties.extract.Extraction;
41  import de.fu_berlin.ties.xml.dom.DOMUtils;
42  
43  /***
44   * A trainable filter that uses a
45   * {@link de.fu_berlin.ties.context.Representation}to convert elements into
46   * feature vectors.
47   * 
48   * @author Christian Siefkes
49   * @version $Revision: 1.9 $, $Date: 2004/11/19 14:05:00 $, $Author: siefkes $
50   */
51  public class RepresentationFilter extends TrainableFilter {
52  
53      /***
54       * The representation used to convert elements into feature vectors.
55       */
56      private final Representation representation;
57  
58      /***
59       * Feature cache used by the representation.
60       */
61      private final Map<Element, List<LocalFeature>> featureCache =
62          new HashMap<Element, List<LocalFeature>>();
63  
64      /***
65       * The purpose of this filter, e.g. "Sentence" if it is used for sentence
66       * filtering.
67       */
68      private final String purpose;
69  
70      /***
71       * List of prior recognitions used by the representation.
72       */
73      private PriorRecognitions priorRecogs;
74  
75      /***
76       * Creates a new instance.
77       * 
78       * @param conf used to initialize the trainable classifier by calling {@link
79       * de.fu_berlin.ties.classify.TrainableClassifier#createClassifier(
80       * java.util.Set, TiesConfiguration)}
81       * @param positiveFilter this filter is queried to decide whether this class
82       * {@link #prefers(Element) prefers}an element
83       * @param negativeFilter this filter is queried to decide whether this class
84       * {@link #avoids(Element) avoids}an element
85       * @param rerank an optional reranker that recalculates probabilities to
86       * introduce a bias (can be used to favor recall over precision, by setting
87       * a bias &gt; 1 for the "true" class, etc.); might be <code>null</code>
88       * @param rep the representation used to convert elements into feature
89       * vectors
90       * @param filterPurpose the purpose of this filter, e.g. "Sentence" if it is
91       * used for sentence filtering
92       * @throws ProcessingException if the initialization of the trainable
93       * classifier fails
94       */
95      public RepresentationFilter(final TiesConfiguration conf,
96              final NodeFilter positiveFilter, final NodeFilter negativeFilter,
97              final Reranker rerank, final Representation rep,
98              final String filterPurpose) throws ProcessingException {
99          super(conf, positiveFilter, negativeFilter, rerank);
100         representation = rep;
101         purpose = filterPurpose;
102     }
103 
104     /***
105      * {@inheritDoc}
106      */
107     public FeatureVector buildFeatures(final Element element) {
108         // collect textual content of element
109         final StringBuffer mainText = new StringBuffer();
110         DOMUtils.collectText(element, mainText);
111 
112         // don't use any left or right text
113         return representation.buildContext(element, "", mainText.toString(),
114                 "", priorRecogs, featureCache, purpose);
115     }
116 
117     /***
118      * {@inheritDoc}
119      */
120     public void init(final Document document) {
121         // reset feature cache + prior recognitions
122         featureCache.clear();
123         priorRecogs = representation.createRecognitionBuffer();
124 
125         // delegate to superclass, just to make sure
126         super.init(document);
127     }
128 
129     /***
130      * {@inheritDoc}
131      */
132     public boolean matches(final Element element) throws ProcessingException {
133         // delegate to super class
134         final boolean result = super.matches(element);
135 
136         // update prior recognitions
137         final Extraction newRecognition;
138 
139         if (result) {
140             // positive result: use element name as type
141             final StringBuffer collectedText = new StringBuffer();
142             DOMUtils.collectText(element, collectedText);
143             newRecognition = new Extraction(DOMUtils.name(element),
144                     collectedText.toString());
145         } else {
146             // negative result: use "<null>" as type and empty string as text
147             newRecognition = new Extraction("<null>", "");
148         }
149 
150         // seal and add recognition
151         newRecognition.setSealed(true);
152         priorRecogs.add(newRecognition);
153 
154         return result;
155     }
156 
157     /***
158      * Returns a string representation of this object.
159      * 
160      * @return a textual representation
161      */
162     public String toString() {
163         return new ToStringBuilder(this)
164                 .appendSuper(super.toString())
165                 .append("representation", representation)
166                 .toString();
167     }
168 
169 }