1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.filter;
23
24 import java.io.File;
25 import java.io.IOException;
26
27 import org.apache.commons.lang.StringUtils;
28 import org.apache.commons.lang.builder.ToStringBuilder;
29 import org.dom4j.Document;
30 import org.dom4j.Element;
31
32 import de.fu_berlin.ties.ContextMap;
33 import de.fu_berlin.ties.ProcessingException;
34 import de.fu_berlin.ties.TiesConfiguration;
35 import de.fu_berlin.ties.extract.AnswerBuilder;
36 import de.fu_berlin.ties.extract.ExtractionContainer;
37 import de.fu_berlin.ties.extract.ExtractionLocator;
38 import de.fu_berlin.ties.extract.TargetStructure;
39 import de.fu_berlin.ties.io.IOUtils;
40 import de.fu_berlin.ties.text.TokenCounter;
41 import de.fu_berlin.ties.text.TokenDetails;
42 import de.fu_berlin.ties.text.TokenizerFactory;
43 import de.fu_berlin.ties.util.Util;
44 import de.fu_berlin.ties.xml.dom.DOMUtils;
45 import de.fu_berlin.ties.xml.dom.TokenProcessor;
46 import de.fu_berlin.ties.xml.dom.TokenWalker;
47
48 /***
49 * A variant of the prediction rewriter that uses predictions from
50 * another process (e.g. named entities) to provide additional semantic
51 * information. This variant does not modify the element structure of the
52 * document, but stores the predictions as XML attributes.
53 *
54 * <p>You should generally use this class instead of
55 * {@link de.fu_berlin.ties.filter.PredictionRewriter} since it generally has
56 * superior results.
57 * Instances of this class are not thread-safe and must not be used to
58 * process multiple documents in parallel.
59 *
60 * @author Christian Siefkes
61 * @version $Revision: 1.10 $, $Date: 2006/10/21 16:04:20 $, $Author: siefkes $
62 */
63 public class PredictionRewriter2 implements DocumentRewriter, TokenProcessor {
64
65 /***
66 * Name of the attribute to add.
67 */
68 public static final String ATTRIB_PRED = "pred";
69
70 /***
71 * Configuration key: "None" marker to use for tokens that do not belong to
72 * any prediction -- if empty or missing, these tokens are not tagged.
73 */
74 public static final String CONFIG_PRED_NONE = "rewriter.pred.none";
75
76
77 /***
78 * Used to configure this instance.
79 */
80 private final TiesConfiguration config;
81
82 /***
83 * Extension of the files containing predictions.
84 */
85 private final String extension;
86
87 /***
88 * "None" marker to use for tokens that do not belong to any prediction --
89 * if <code>null</code>, these tokens are not tagged (default behavior).
90 */
91 private final String noneMarker;
92
93 /***
94 * Prediction locator for the current document.
95 */
96 private ExtractionLocator predLocator;
97
98 /***
99 * Target structure to use for reading predictions.
100 */
101 private final TargetStructure targetStruct;
102
103 /***
104 * Factory used to create tokenizers.
105 */
106 private final TokenizerFactory tFactory;
107
108 /***
109 * Counts how often tokens are repeated in a document -- required to
110 * localize predictions.
111 */
112 private final TokenCounter tCount = new TokenCounter();
113
114
115 /***
116 * Creates a new instance.
117 *
118 * @param conf used to configure this instance; must not be
119 * <code>null</code>
120 * @throws ProcessingException if an error occurs while initializing the
121 * combination strategies
122 */
123 public PredictionRewriter2(final TiesConfiguration conf)
124 throws ProcessingException {
125 this(conf.getString(PredictionRewriter.CONFIG_PRED_EXT),
126 conf.getStringArray("rewriter.pred.classes"),
127 conf.getString(CONFIG_PRED_NONE, null),
128 new TokenizerFactory(conf), conf);
129 }
130
131 /***
132 * Creates a new instance.
133 *
134 * @param fileExtension extension of the files containing predictions
135 * @param predictionClasses names of the prediction classes to use --
136 * if empty array, all are used
137 * @param myNoneMarker "none" marker to use for tokens that do not belong to
138 * any prediction -- if empty or <code>null</code>, these tokens are not
139 * tagged
140 * @param factory used to instantiate tokenizers
141 * @param conf used to configure this instance; must not be
142 * <code>null</code>
143 * @throws ProcessingException if an error occurs while initializing the
144 * combination strategies
145 */
146 public PredictionRewriter2(final String fileExtension,
147 final String[] predictionClasses, final String myNoneMarker,
148 final TokenizerFactory factory, final TiesConfiguration conf)
149 throws ProcessingException {
150 super();
151 config = conf;
152 extension = fileExtension;
153 targetStruct = new TargetStructure(predictionClasses);
154 tFactory = factory;
155
156
157 if (StringUtils.isEmpty(myNoneMarker)) {
158 noneMarker = null;
159 } else {
160 noneMarker = myNoneMarker;
161 Util.LOG.debug("PredictionRewriter2: setting 'none' marker to '"
162 + noneMarker + "'");
163 }
164 }
165
166
167 /***
168 * Initializes a document to process, reading the corresponding prediction
169 * file(s).
170 *
171 * @param filename the file name of the current document
172 * @return <code>true</code> iff any predictions for this document exist
173 * @throws IOException if an I/O error occurs
174 */
175 private boolean initDocument(final File filename)
176 throws IOException {
177 final File directory = filename.getParentFile();
178 final String localName = filename.getName();
179 final String prefix =
180 IOUtils.getBaseName(localName) + IOUtils.EXT_SEPARATOR;
181 File predFile;
182 ExtractionContainer predictions;
183
184
185 tCount.clear();
186
187
188 if (predLocator != null) {
189 predLocator.reachedEndOfDocument();
190 }
191
192
193 predFile = new File(directory, prefix + extension);
194
195 if (predFile.exists()) {
196 predictions = AnswerBuilder.readAnswerKeys(targetStruct,
197 predFile, config);
198 predLocator = new ExtractionLocator(predictions,
199 tFactory.createTokenizer(""));
200 return true;
201 } else {
202 Util.LOG.info("No '" + extension + "' file found for "
203 + localName + " -- assuming there are no predictions");
204 predLocator = null;
205 return false;
206 }
207 }
208
209 /***
210 * {@inheritDoc}
211 */
212 public void processToken(final Element element, final String left,
213 final TokenDetails details, final String right,
214 final ContextMap context) throws IOException {
215 final String token = details.getToken();
216 tCount.add(false, token);
217 final int tokenRep = tCount.getLastRep();
218 final String predType;
219
220
221 if (predLocator != null) {
222
223 predLocator.startOfExtraction(token, tokenRep);
224
225 if (predLocator.inExtraction()) {
226 predType = predLocator.getCurrentExtraction().getType();
227
228
229 predLocator.updateExtraction(token, tokenRep);
230
231
232 if (predLocator.endOfExtraction()) {
233
234 predLocator.switchToNextExtraction();
235 }
236 } else {
237
238 predType = noneMarker;
239 }
240 } else {
241
242 predType = noneMarker;
243 }
244
245 if (predType != null) {
246 final String oldAttribValue = element.attributeValue(ATTRIB_PRED);
247
248
249
250 if (oldAttribValue == null || oldAttribValue.equals(noneMarker)) {
251 element.addAttribute(ATTRIB_PRED, predType);
252 } else if (!oldAttribValue.equals(predType)) {
253
254 Util.LOG.debug("Could not add " + predType
255 + " prediction since there is a " + ATTRIB_PRED
256 + "='" + oldAttribValue + "' attribute "
257 + DOMUtils.showToken(element, token));
258 }
259 }
260 }
261
262 /***
263 * {@inheritDoc}
264 */
265 public Document rewrite(final Document document, final File filename)
266 throws IOException, ProcessingException {
267 final ContextMap dummyContext = new ContextMap();
268
269 if (initDocument(filename)) {
270
271 final TokenWalker walker = new TokenWalker(this, tFactory);
272 walker.walk(document, dummyContext);
273 }
274
275 return document;
276 }
277
278 /***
279 * Returns a string representation of this object.
280 *
281 * @return a textual representation
282 */
283 public String toString() {
284 return new ToStringBuilder(this)
285 .append("extension", extension)
286 .toString();
287 }
288
289 }
290