1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.extract;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.Reader;
27 import java.io.Writer;
28
29 import org.apache.commons.configuration.Configuration;
30 import org.apache.commons.lang.builder.ToStringBuilder;
31 import org.dom4j.Document;
32 import org.dom4j.Element;
33 import org.dom4j.NodeFilter;
34
35 import de.fu_berlin.ties.ContextMap;
36 import de.fu_berlin.ties.DocumentReader;
37 import de.fu_berlin.ties.ProcessingException;
38 import de.fu_berlin.ties.TiesConfiguration;
39
40 import de.fu_berlin.ties.io.FieldContainer;
41 import de.fu_berlin.ties.io.IOUtils;
42 import de.fu_berlin.ties.text.TokenContainer;
43 import de.fu_berlin.ties.text.TokenDetails;
44 import de.fu_berlin.ties.text.TokenizerFactory;
45 import de.fu_berlin.ties.xml.dom.DOMUtils;
46 import de.fu_berlin.ties.xml.dom.DocumentWalker;
47 import de.fu_berlin.ties.xml.dom.ElementNameFilter;
48 import de.fu_berlin.ties.xml.dom.ElementProcessor;
49
50 /***
51 * Buildings an {@link de.fu_berlin.ties.extract.ExtractionContainer} of answer
52 * keys from an annotated text (in XML format).
53 *
54 * <p>Instances of this class are thread-safe and can process several documents
55 * in parallel.
56 *
57 * @author Christian Siefkes
58 * @version $Revision: 1.20 $, $Date: 2004/11/19 14:04:41 $, $Author: siefkes $
59 */
60 public class AnswerBuilder extends DocumentReader implements ElementProcessor {
61
62 /***
63 * Context key referring to the extraction container used for storing the
64 * answer keys.
65 */
66 public static final String KEY_ANSWERS = "answer-keys";
67
68 /***
69 * The recommended file extension to use for storing answer keys.
70 */
71 public static final String EXT_ANSWERS = "ans";
72
73 /***
74 * Reads back answer keys stored by the
75 * {@link #process(Document, Writer, ContextMap)} method of an instance of
76 * this class.
77 *
78 * @param targetStruct the target structure used when creating the answer
79 * keys
80 * @param file the file containing the answer keys
81 * @param config configuration used to determine the character set of the
82 * keys (cf. {@link IOUtils#openReader(File, Configuration)}
83 * @return an extraction container of the answer keys
84 * @throws IllegalArgumentException if the
85 * (@linkplain de.fu_berlin.ties.classify.Prediction#getType() type) of
86 * some answer keys in the answer keys doesn't fit the target structure
87 * @throws IOException if an I/O error occurs while reading the file
88 */
89 public static ExtractionContainer readAnswerKeys(
90 final TargetStructure targetStruct, final File file,
91 final Configuration config)
92 throws IllegalArgumentException, IOException {
93 final Reader reader = IOUtils.openReader(file, config);
94 final FieldContainer fContainer =
95 FieldContainer.createFieldContainer(reader);
96 return new ExtractionContainer(targetStruct, fContainer);
97 }
98
99 /***
100 * Reads the answer keys corresponding to a file. The answer keys must be
101 * in a file ending in {@link AnswerBuilder#EXT_ANSWERS} instead of the
102 * extension of the original file.
103 *
104 * @param targetStruct the target structure used when creating the answer
105 * keys
106 * @param orgFile the file whose answer keys should be returned
107 * @param config configuration used to determine the character set of the
108 * keys (cf. {@link IOUtils#openReader(File, Configuration)}
109 * @return an extraction container of the answer keys
110 * @throws IllegalArgumentException if the
111 * (@linkplain de.fu_berlin.ties.classify.Prediction#getType() type) of
112 * some answer keys in the answer keys doesn't fit the target structure
113 * @throws IOException if an I/O error occurs while reading the file
114 */
115 public static ExtractionContainer readCorrespondingAnswerKeys(
116 final TargetStructure targetStruct, final File orgFile,
117 final Configuration config)
118 throws IllegalArgumentException, IOException {
119
120 final File answerFile = new File(orgFile.getParentFile(),
121 IOUtils.getBaseName(orgFile) + IOUtils.EXT_SEPARATOR
122 + EXT_ANSWERS);
123 return readAnswerKeys(targetStruct, answerFile, config);
124 }
125
126 /***
127 * The target structure specifying the classes to recognize.
128 */
129 private final TargetStructure targetStructure;
130
131 /***
132 * Used to instantiate tokenizers.
133 */
134 private final TokenizerFactory factory;
135
136 /***
137 * Creates a new instance, using a default extension and configuring the
138 * target structure from the {@linkplain TiesConfiguration#CONF standard
139 * configuration}.
140 */
141 public AnswerBuilder() {
142 this(EXT_ANSWERS);
143 }
144
145 /***
146 * Creates a new instance, configuring the target structure from the
147 * {@linkplain TiesConfiguration#CONF standard configuration}.
148 *
149 * @param outExt the extension to use for output files
150 */
151 public AnswerBuilder(final String outExt) {
152 this(outExt, TiesConfiguration.CONF);
153 }
154
155 /***
156 * Creates a new instance, configuring the target structure from the
157 * provided configuration.
158 *
159 * @param outExt the extension to use for output files
160 * @param config the configuration to use
161 */
162 public AnswerBuilder(final String outExt, final TiesConfiguration config) {
163 this(outExt, new TargetStructure(config), new TokenizerFactory(config),
164 config);
165 }
166
167 /***
168 * Creates a new instance.
169 * @param outExt the extension to use for output files
170 * @param targetStruct the target structure specifying the classes to
171 * recognize
172 * @param tFactory used to instantiate tokenizers
173 * @param config the configuration to use
174 */
175 public AnswerBuilder(final String outExt,
176 final TargetStructure targetStruct, final TokenizerFactory tFactory,
177 final TiesConfiguration config) {
178 super(outExt, config);
179 targetStructure = targetStruct;
180 factory = tFactory;
181 }
182
183 /***
184 * Buildings an {@link de.fu_berlin.ties.extract.ExtractionContainer} of
185 * answer keys from from an annotated XML document.
186 *
187 * @param document the document to read
188 * @return a container of the answer keys of this document
189 * @throws IOException if an I/O error occurs
190 * @throws ProcessingException if an error occurs during processing
191 */
192 public ExtractionContainer buildAnswers(final Document document)
193 throws IOException, ProcessingException {
194
195 final ExtractionContainer answerKeys =
196 new ExtractionContainer(targetStructure);
197 final ContextMap context = new ContextMap();
198 context.put(KEY_ANSWERS, answerKeys);
199
200 final NodeFilter filter =
201 new ElementNameFilter(targetStructure.getClassNames());
202 final DocumentWalker walker = new DocumentWalker(filter, this, factory);
203
204
205
206 walker.walk(document, context);
207
208
209 return answerKeys;
210 }
211
212 /***
213 * Returns the target structure specifying the classes to recognize.
214 * @return the used target structure
215 */
216 public TargetStructure getTargetStructure() {
217 return targetStructure;
218 }
219
220 /***
221 * Buildings an {@link de.fu_berlin.ties.extract.ExtractionContainer} of
222 * answer keys from from an annotated XML document.
223 *
224 * @param document the document to read
225 * @param writer the writer to write the processed text to; flushed
226 * but not closed by this method
227 * @param context a map of objects that are made available for processing
228 * @throws IOException if an I/O error occurs
229 * @throws ProcessingException if an error occurs during processing
230 */
231 public void process(final Document document, final Writer writer,
232 final ContextMap context) throws IOException, ProcessingException {
233
234 final ExtractionContainer answers = buildAnswers(document);
235
236
237 final FieldContainer storage = FieldContainer.createFieldContainer();
238 answers.storeEntries(storage);
239 storage.store(writer);
240 }
241
242 /***
243 * Classifies an element in an XML document, building features and
244 * delegating to the classifier.
245 *
246 * @param element the element to process
247 * @param tokenContainer a container storing all tokens seen in the
248 * document so far; {@link TokenContainer#getLast()} contains the textual
249 * content of the element and its child elements
250 * @param context a map of objects that are made available for processing;
251 * the {@link #KEY_ANSWERS} key must map to an extraction container used for
252 * storing the answer keys
253 */
254 public void processElement(final Element element,
255 final TokenContainer tokenContainer, final ContextMap context) {
256
257 final TokenDetails details = new TokenDetails(tokenContainer.getLast(),
258 tokenContainer.getFirstTokenInLastRep(),
259 tokenContainer.getFirstTokenInLastIndex(), false);
260 final Extraction extraction =
261 new Extraction(DOMUtils.name(element), details);
262 final ExtractionContainer answerKeys =
263 (ExtractionContainer) context.get(KEY_ANSWERS);
264 answerKeys.add(extraction);
265 }
266
267 /***
268 * Returns a string representation of this object.
269 * @return a textual representation
270 */
271 public String toString() {
272 return new ToStringBuilder(this)
273 .appendSuper(super.toString())
274 .append("target structure", targetStructure)
275 .toString();
276 }
277
278 }