1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.extract;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.Reader;
27 import java.io.Writer;
28 import java.util.Arrays;
29 import java.util.HashSet;
30 import java.util.Set;
31
32 import org.apache.commons.lang.builder.ToStringBuilder;
33 import org.dom4j.Document;
34 import org.dom4j.Element;
35 import org.dom4j.NodeFilter;
36
37 import de.fu_berlin.ties.ContextMap;
38 import de.fu_berlin.ties.DocumentReader;
39 import de.fu_berlin.ties.ProcessingException;
40 import de.fu_berlin.ties.TiesConfiguration;
41
42 import de.fu_berlin.ties.io.FieldContainer;
43 import de.fu_berlin.ties.io.IOUtils;
44 import de.fu_berlin.ties.text.TokenContainer;
45 import de.fu_berlin.ties.text.TokenDetails;
46 import de.fu_berlin.ties.text.TokenizerFactory;
47 import de.fu_berlin.ties.util.Util;
48 import de.fu_berlin.ties.xml.dom.DOMUtils;
49 import de.fu_berlin.ties.xml.dom.DocumentWalker;
50 import de.fu_berlin.ties.xml.dom.ElementNameFilter;
51 import de.fu_berlin.ties.xml.dom.ElementProcessor;
52
53 /***
54 * Buildings an {@link de.fu_berlin.ties.extract.ExtractionContainer} of answer
55 * keys from an annotated text (in XML format).
56 *
57 * <p>Instances of this class are thread-safe and can process several documents
58 * in parallel.
59 *
60 * @author Christian Siefkes
61 * @version $Revision: 1.28 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
62 */
63 public class AnswerBuilder extends DocumentReader implements ElementProcessor {
64
65 /***
66 * Context key referring to the extraction container used for storing the
67 * answer keys.
68 */
69 public static final String KEY_ANSWERS = "answer-keys";
70
71 /***
72 * The recommended file extension to use for storing answer keys.
73 */
74 public static final String EXT_ANSWERS = "ans";
75
76
77 /***
78 * Reads back answer keys stored by the
79 * {@link #process(Document, Writer, ContextMap)} method of an instance of
80 * this class.
81 *
82 * @param targetStruct the target structure used when creating the answer
83 * keys
84 * @param file the file containing the answer keys
85 * @param config configuration used to determine the character set of the
86 * keys (cf. {@link IOUtils#openReader(File,
87 * org.apache.commons.configuration.Configuration)}
88 * @return an extraction container of the answer keys
89 * @throws IllegalArgumentException if the
90 * (@linkplain de.fu_berlin.ties.classify.Prediction#getType() type) of
91 * some answer keys in the answer keys doesn't fit the target structure
92 * @throws IOException if an I/O error occurs while reading the file
93 */
94 public static ExtractionContainer readAnswerKeys(
95 final TargetStructure targetStruct, final File file,
96 final TiesConfiguration config)
97 throws IllegalArgumentException, IOException {
98 if (file.exists()) {
99 final Reader reader = IOUtils.openReader(file, config);
100 final FieldContainer fContainer =
101 FieldContainer.createFieldContainer(config);
102 fContainer.read(reader);
103 return new ExtractionContainer(targetStruct, fContainer);
104 } else {
105
106 Util.LOG.info("Answer key file " + file
107 + " does not exists -- assuming there are no answer keys");
108 return new ExtractionContainer(targetStruct);
109 }
110 }
111
112 /***
113 * Reads the answer keys corresponding to a file. The answer keys must be
114 * in a file ending in {@link AnswerBuilder#EXT_ANSWERS} instead of the
115 * extension of the original file.
116 *
117 * @param targetStruct the target structure used when creating the answer
118 * keys
119 * @param orgFile the file whose answer keys should be returned
120 * @param config configuration used to determine the character set of the
121 * keys (cf. {@link IOUtils#openReader(File,
122 * org.apache.commons.configuration.Configuration)}
123 * @return an extraction container of the answer keys
124 * @throws IllegalArgumentException if the
125 * (@linkplain de.fu_berlin.ties.classify.Prediction#getType() type) of
126 * some answer keys in the answer keys doesn't fit the target structure
127 * @throws IOException if an I/O error occurs while reading the file
128 */
129 public static ExtractionContainer readCorrespondingAnswerKeys(
130 final TargetStructure targetStruct, final File orgFile,
131 final TiesConfiguration config)
132 throws IllegalArgumentException, IOException {
133
134 final File answerFile = new File(orgFile.getParentFile(),
135 IOUtils.getBaseName(orgFile) + IOUtils.EXT_SEPARATOR
136 + EXT_ANSWERS);
137 return readAnswerKeys(targetStruct, answerFile, config);
138 }
139
140
141 /***
142 * The names of the elements to read attributes from.
143 */
144 private final Set<String> elementNames;
145
146 /***
147 * Optional attribute determining the type of extraction.
148 */
149 private final String typeAttrib;
150
151 /***
152 * The target structure specifying the classes to recognize.
153 */
154 private final TargetStructure targetStructure;
155
156 /***
157 * Used to instantiate tokenizers.
158 */
159 private final TokenizerFactory factory;
160
161
162 /***
163 * Creates a new instance, using a default extension and configuring the
164 * target structure from the {@linkplain TiesConfiguration#CONF standard
165 * configuration}.
166 */
167 public AnswerBuilder() {
168 this(EXT_ANSWERS);
169 }
170
171 /***
172 * Creates a new instance, configuring the target structure from the
173 * {@linkplain TiesConfiguration#CONF standard configuration}.
174 *
175 * @param outExt the extension to use for output files
176 */
177 public AnswerBuilder(final String outExt) {
178 this(outExt, TiesConfiguration.CONF);
179 }
180
181 /***
182 * Creates a new instance, configuring the target structure from the
183 * provided configuration.
184 *
185 * @param outExt the extension to use for output files
186 * @param config the configuration to use
187 */
188 public AnswerBuilder(final String outExt, final TiesConfiguration config) {
189 this(outExt, new TargetStructure(config), new TokenizerFactory(config),
190 config.getStringArray("answers.element"),
191 config.getString("answers.attrib", null),
192 config);
193 }
194
195 /***
196 * Creates a new instance.
197 *
198 * @param outExt the extension to use for output files
199 * @param targetStruct the target structure specifying the classes to
200 * recognize
201 * @param tFactory used to instantiate tokenizers
202 * @param myElementNames if not <code>null</code> or empty, answer types are
203 * read from the <code>myTypeAttrib</code> attribute of the elements
204 * specified in this list instead of using element names as types
205 * @param myTypeAttrib Name of attribute to read element types from if
206 * the elemant name shouldn't be used; must be <code>null</code> iff
207 * <code>myElementNames</code> is <code>null</code> or empty
208 * @param config the configuration to use
209 */
210 public AnswerBuilder(final String outExt,
211 final TargetStructure targetStruct, final TokenizerFactory tFactory,
212 final String[] myElementNames, final String myTypeAttrib,
213 final TiesConfiguration config) {
214 super(outExt, config);
215 targetStructure = targetStruct;
216 factory = tFactory;
217 typeAttrib = myTypeAttrib;
218
219 if (typeAttrib != null) {
220
221 elementNames = new HashSet<String>(Arrays.asList(myElementNames));
222 } else {
223
224 elementNames = targetStructure.getClassNames();
225 }
226 }
227
228
229 /***
230 * Buildings an {@link de.fu_berlin.ties.extract.ExtractionContainer} of
231 * answer keys from from an annotated XML document.
232 *
233 * @param document the document to read
234 * @return a container of the answer keys of this document
235 * @throws IOException if an I/O error occurs
236 * @throws ProcessingException if an error occurs during processing
237 */
238 public ExtractionContainer buildAnswers(final Document document)
239 throws IOException, ProcessingException {
240
241 final ExtractionContainer answerKeys =
242 new ExtractionContainer(targetStructure);
243 final ContextMap context = new ContextMap();
244 context.put(KEY_ANSWERS, answerKeys);
245
246 final NodeFilter filter = new ElementNameFilter(elementNames);
247 final DocumentWalker walker = new DocumentWalker(filter, this, factory);
248
249
250
251 walker.walk(document, context);
252
253
254 return answerKeys;
255 }
256
257 /***
258 * Returns the target structure specifying the classes to recognize.
259 * @return the used target structure
260 */
261 public TargetStructure getTargetStructure() {
262 return targetStructure;
263 }
264
265 /***
266 * Buildings an {@link de.fu_berlin.ties.extract.ExtractionContainer} of
267 * answer keys from from an annotated XML document.
268 *
269 * @param document the document to read
270 * @param writer the writer to write the processed text to; flushed
271 * but not closed by this method
272 * @param context a map of objects that are made available for processing
273 * @throws IOException if an I/O error occurs
274 * @throws ProcessingException if an error occurs during processing
275 */
276 public void process(final Document document, final Writer writer,
277 final ContextMap context) throws IOException, ProcessingException {
278
279 final ExtractionContainer answers = buildAnswers(document);
280
281
282 final FieldContainer storage =
283 FieldContainer.createFieldContainer(getConfig());
284 answers.storeEntries(storage);
285 storage.store(writer);
286 }
287
288 /***
289 * Builds an answer key from an element in an XML document.
290 *
291 * @param element the element to process
292 * @param tokenContainer a container storing all tokens seen in the
293 * document so far; {@link TokenContainer#getLast()} contains the textual
294 * content of the element and its child elements
295 * @param context a map of objects that are made available for processing;
296 * the {@link #KEY_ANSWERS} key must map to an extraction container used for
297 * storing the answer keys
298 */
299 public void processElement(final Element element,
300 final TokenContainer tokenContainer, final ContextMap context) {
301
302 final TokenDetails details = new TokenDetails(tokenContainer.getLast(),
303 tokenContainer.getFirstTokenInLastRep(),
304 tokenContainer.getFirstTokenInLastIndex(), false);
305
306
307 final String extractionType = (typeAttrib != null)
308 ? element.attributeValue(typeAttrib) : DOMUtils.name(element);
309 final Extraction extraction =
310 new Extraction(extractionType, details);
311 final ExtractionContainer answerKeys =
312 (ExtractionContainer) context.get(KEY_ANSWERS);
313 answerKeys.add(extraction);
314 }
315
316 /***
317 * Returns a string representation of this object.
318 * @return a textual representation
319 */
320 public String toString() {
321 return new ToStringBuilder(this)
322 .appendSuper(super.toString())
323 .append("target structure", targetStructure)
324 .toString();
325 }
326
327 }