1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.filter;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.util.HashMap;
27 import java.util.List;
28 import java.util.Map;
29
30 import org.apache.commons.lang.builder.ToStringBuilder;
31 import org.dom4j.Document;
32 import org.dom4j.Element;
33 import org.dom4j.NodeFilter;
34
35 import de.fu_berlin.ties.ProcessingException;
36 import de.fu_berlin.ties.TiesConfiguration;
37 import de.fu_berlin.ties.classify.Reranker;
38 import de.fu_berlin.ties.classify.feature.FeatureVector;
39 import de.fu_berlin.ties.context.LocalFeature;
40 import de.fu_berlin.ties.context.PriorRecognitions;
41 import de.fu_berlin.ties.context.Representation;
42 import de.fu_berlin.ties.extract.Extraction;
43 import de.fu_berlin.ties.text.TokenizerFactory;
44 import de.fu_berlin.ties.xml.dom.DOMUtils;
45
46 /***
47 * A trainable filter that uses a
48 * {@link de.fu_berlin.ties.context.Representation}to convert elements into
49 * feature vectors.
50 *
51 * @author Christian Siefkes
52 * @version $Revision: 1.16 $, $Date: 2006/10/21 16:04:20 $, $Author: siefkes $
53 */
54 public class RepresentationFilter extends TrainableFilter {
55
56 /***
57 * The representation used to convert elements into feature vectors.
58 */
59 private final Representation representation;
60
61 /***
62 * Feature cache used by the representation.
63 */
64 private final Map<Element, List<LocalFeature>> featureCache =
65 new HashMap<Element, List<LocalFeature>>();
66
67 /***
68 * The purpose of this filter, e.g. "Sentence" if it is used for sentence
69 * filtering.
70 */
71 private final String purpose;
72
73 /***
74 * A factory for creating tokenizers.
75 */
76 private final TokenizerFactory tFactory;
77
78 /***
79 * List of prior recognitions used by the representation.
80 */
81 private PriorRecognitions priorRecogs;
82
83 /***
84 * Creates a new instance.
85 *
86 * @param conf used to initialize the trainable classifier by calling {@link
87 * de.fu_berlin.ties.classify.TrainableClassifier#createClassifier(
88 * java.util.Set, TiesConfiguration, String)} with an optional "filter"
89 * suffix
90 * @param positiveFilter this filter is queried to decide whether this class
91 * {@link #prefers(Element) prefers}an element
92 * @param negativeFilter this filter is queried to decide whether this class
93 * {@link #avoids(Element) avoids}an element
94 * @param rerank an optional reranker that recalculates probabilities to
95 * introduce a bias (can be used to favor recall over precision, by setting
96 * a bias > 1 for the "true" class, etc.); might be <code>null</code>
97 * @param rep the representation used to convert elements into feature
98 * vectors
99 * @param filterPurpose the purpose of this filter, e.g. "Sentence" if it is
100 * used for sentence filtering
101 * @throws ProcessingException if the initialization of the trainable
102 * classifier fails
103 */
104 public RepresentationFilter(final TiesConfiguration conf,
105 final NodeFilter positiveFilter, final NodeFilter negativeFilter,
106 final Reranker rerank, final Representation rep,
107 final String filterPurpose) throws ProcessingException {
108 super(conf, positiveFilter, negativeFilter, rerank);
109 representation = rep;
110 purpose = filterPurpose;
111 tFactory = new TokenizerFactory(conf);
112 }
113
114 /***
115 * {@inheritDoc}
116 */
117 public FeatureVector buildFeatures(final Element element) {
118
119 final StringBuilder mainText = new StringBuilder();
120 DOMUtils.collectText(element, mainText);
121
122
123 return representation.buildContext(element, "", mainText.toString(),
124 "", priorRecogs, featureCache, purpose);
125 }
126
127 /***
128 * {@inheritDoc}
129 */
130 public void init(final Document document, final File filename)
131 throws ProcessingException, IOException {
132
133 featureCache.clear();
134 priorRecogs = representation.initDocument(filename, tFactory);
135
136
137 super.init(document, filename);
138 }
139
140 /***
141 * {@inheritDoc}
142 */
143 public boolean matches(final Element element) throws ProcessingException {
144
145 final boolean result = super.matches(element);
146
147
148 final Extraction newRecognition;
149
150 if (result) {
151
152 final StringBuilder collectedText = new StringBuilder();
153 DOMUtils.collectText(element, collectedText);
154 newRecognition = new Extraction(DOMUtils.name(element),
155 collectedText.toString());
156 } else {
157
158 newRecognition = new Extraction("<null>", "");
159 }
160
161
162 newRecognition.setSealed(true);
163 priorRecogs.add(newRecognition);
164
165 return result;
166 }
167
168 /***
169 * Returns a string representation of this object.
170 *
171 * @return a textual representation
172 */
173 public String toString() {
174 return new ToStringBuilder(this)
175 .appendSuper(super.toString())
176 .append("representation", representation)
177 .toString();
178 }
179
180 }