View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.extract;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.Reader;
27  import java.io.Writer;
28  
29  import org.apache.commons.configuration.Configuration;
30  import org.apache.commons.lang.builder.ToStringBuilder;
31  import org.dom4j.Document;
32  import org.dom4j.Element;
33  import org.dom4j.NodeFilter;
34  
35  import de.fu_berlin.ties.ContextMap;
36  import de.fu_berlin.ties.DocumentReader;
37  import de.fu_berlin.ties.ProcessingException;
38  import de.fu_berlin.ties.TiesConfiguration;
39  
40  import de.fu_berlin.ties.io.FieldContainer;
41  import de.fu_berlin.ties.io.IOUtils;
42  import de.fu_berlin.ties.text.TokenContainer;
43  import de.fu_berlin.ties.text.TokenDetails;
44  import de.fu_berlin.ties.text.TokenizerFactory;
45  import de.fu_berlin.ties.xml.dom.DOMUtils;
46  import de.fu_berlin.ties.xml.dom.DocumentWalker;
47  import de.fu_berlin.ties.xml.dom.ElementNameFilter;
48  import de.fu_berlin.ties.xml.dom.ElementProcessor;
49  
50  /***
51   * Buildings an {@link de.fu_berlin.ties.extract.ExtractionContainer} of answer
52   * keys from an annotated text (in XML format).
53   *
54   * <p>Instances of this class are thread-safe and can process several documents
55   * in parallel.
56   *
57   * @author Christian Siefkes
58   * @version $Revision: 1.20 $, $Date: 2004/11/19 14:04:41 $, $Author: siefkes $
59   */
60  public class AnswerBuilder extends DocumentReader implements ElementProcessor {
61  
62      /***
63       * Context key referring to the extraction container used for storing the
64       * answer keys.
65       */
66      public static final String KEY_ANSWERS = "answer-keys";
67  
68      /***
69       * The recommended file extension to use for storing answer keys.
70       */
71      public static final String EXT_ANSWERS = "ans";
72  
73      /***
74       * Reads back answer keys stored by the
75       * {@link #process(Document, Writer, ContextMap)} method of an instance of
76       * this class.
77       *
78       * @param targetStruct the target structure used when creating the answer
79       * keys
80       * @param file the file containing the answer keys
81       * @param config configuration used to determine the character set of the
82       * keys (cf. {@link IOUtils#openReader(File, Configuration)}
83       * @return an extraction container of the answer keys
84       * @throws IllegalArgumentException if the
85       * (@linkplain de.fu_berlin.ties.classify.Prediction#getType() type) of
86       * some answer keys in the answer keys doesn't fit the target structure
87       * @throws IOException if an I/O error occurs while reading the file
88       */
89      public static ExtractionContainer readAnswerKeys(
90              final TargetStructure targetStruct, final File file,
91              final Configuration config)
92              throws IllegalArgumentException, IOException {
93          final Reader reader = IOUtils.openReader(file, config);
94          final FieldContainer fContainer =
95              FieldContainer.createFieldContainer(reader);
96          return new ExtractionContainer(targetStruct, fContainer);
97      }
98  
99      /***
100      * Reads the answer keys corresponding to a file. The answer keys must be
101      * in a file ending in {@link AnswerBuilder#EXT_ANSWERS} instead of the
102      * extension of the original file.
103      *
104      * @param targetStruct the target structure used when creating the answer
105      * keys
106      * @param orgFile the file whose answer keys should be returned
107      * @param config configuration used to determine the character set of the
108      * keys (cf. {@link IOUtils#openReader(File, Configuration)}
109      * @return an extraction container of the answer keys
110      * @throws IllegalArgumentException if the
111      * (@linkplain de.fu_berlin.ties.classify.Prediction#getType() type) of
112      * some answer keys in the answer keys doesn't fit the target structure
113      * @throws IOException if an I/O error occurs while reading the file
114      */
115     public static ExtractionContainer readCorrespondingAnswerKeys(
116             final TargetStructure targetStruct, final File orgFile,
117             final Configuration config)
118             throws IllegalArgumentException, IOException {
119         // replace extension + delegate
120         final File answerFile = new File(orgFile.getParentFile(),
121                 IOUtils.getBaseName(orgFile) + IOUtils.EXT_SEPARATOR
122                 + EXT_ANSWERS);
123         return readAnswerKeys(targetStruct, answerFile, config);
124     }
125 
126     /***
127      * The target structure specifying the classes to recognize.
128      */
129     private final TargetStructure targetStructure;
130 
131     /***
132      * Used to instantiate tokenizers.
133      */
134     private final TokenizerFactory factory;
135 
136     /***
137      * Creates a new instance, using a default extension and configuring the
138      * target structure from the {@linkplain TiesConfiguration#CONF standard
139      * configuration}.
140      */
141     public AnswerBuilder() {
142         this(EXT_ANSWERS);
143     }
144 
145     /***
146      * Creates a new instance, configuring the target structure from the
147      * {@linkplain TiesConfiguration#CONF standard configuration}.
148      *
149      * @param outExt the extension to use for output files
150      */
151     public AnswerBuilder(final String outExt) {
152         this(outExt, TiesConfiguration.CONF);
153     }
154 
155     /***
156      * Creates a new instance, configuring the target structure from the
157      * provided configuration.
158      *
159      * @param outExt the extension to use for output files
160      * @param config the configuration to use
161      */
162     public AnswerBuilder(final String outExt, final TiesConfiguration config) {
163         this(outExt, new TargetStructure(config), new TokenizerFactory(config),
164             config);
165     }
166 
167     /***
168      * Creates a new instance.
169      * @param outExt the extension to use for output files
170      * @param targetStruct the target structure specifying the classes to
171      * recognize
172      * @param tFactory used to instantiate tokenizers
173      * @param config the configuration to use
174      */
175     public AnswerBuilder(final String outExt,
176             final TargetStructure targetStruct, final TokenizerFactory tFactory,
177             final TiesConfiguration config) {
178         super(outExt, config);
179         targetStructure = targetStruct;
180         factory = tFactory;
181     }
182 
183     /***
184      * Buildings an {@link de.fu_berlin.ties.extract.ExtractionContainer} of
185      * answer keys from from an annotated XML document.
186      *
187      * @param document the document to read
188      * @return a container of the answer keys of this document
189      * @throws IOException if an I/O error occurs
190      * @throws ProcessingException if an error occurs during processing
191      */
192     public ExtractionContainer buildAnswers(final Document document)
193             throws IOException, ProcessingException {
194         // create container of answer keys and store in context
195         final ExtractionContainer answerKeys =
196             new ExtractionContainer(targetStructure);
197         final ContextMap context = new ContextMap();
198         context.put(KEY_ANSWERS, answerKeys);
199 
200         final NodeFilter filter =
201             new ElementNameFilter(targetStructure.getClassNames());
202         final DocumentWalker walker = new DocumentWalker(filter, this, factory);
203 
204         // the walker will call back (process method) for each element matching
205         // a class name
206         walker.walk(document, context);
207 
208         // return the container of answer keys
209         return answerKeys;
210     }
211 
212     /***
213      * Returns the target structure specifying the classes to recognize.
214      * @return the used target structure
215      */
216     public TargetStructure getTargetStructure() {
217         return targetStructure;
218     }
219 
220     /***
221      * Buildings an {@link de.fu_berlin.ties.extract.ExtractionContainer} of
222      * answer keys from from an annotated XML document.
223      *
224      * @param document the document to read
225      * @param writer the writer to write the processed text to; flushed
226      * but not closed by this method
227      * @param context a map of objects that are made available for processing
228      * @throws IOException if an I/O error occurs
229      * @throws ProcessingException if an error occurs during processing
230      */
231     public void process(final Document document, final Writer writer,
232             final ContextMap context) throws IOException, ProcessingException {
233         // delegate to build method
234         final ExtractionContainer answers = buildAnswers(document);
235 
236         // serialize results
237         final FieldContainer storage = FieldContainer.createFieldContainer();
238         answers.storeEntries(storage);
239         storage.store(writer);
240     }
241 
242     /***
243      * Classifies an element in an XML document, building features and
244      * delegating to the classifier.
245      *
246      * @param element the element to process
247      * @param tokenContainer a container storing all tokens seen in the
248      * document so far; {@link TokenContainer#getLast()} contains the textual
249      * content of the element and its child elements
250      * @param context a map of objects that are made available for processing;
251      * the {@link #KEY_ANSWERS} key must map to an extraction container used for
252      * storing the answer keys
253      */
254     public void processElement(final Element element,
255             final TokenContainer tokenContainer, final ContextMap context) {
256         // create and store an extraction of the element type
257         final TokenDetails details = new TokenDetails(tokenContainer.getLast(),
258                 tokenContainer.getFirstTokenInLastRep(),
259                 tokenContainer.getFirstTokenInLastIndex(), false);
260         final Extraction extraction =
261             new Extraction(DOMUtils.name(element), details);
262         final ExtractionContainer answerKeys =
263             (ExtractionContainer) context.get(KEY_ANSWERS);
264         answerKeys.add(extraction);
265     }
266 
267     /***
268      * Returns a string representation of this object.
269      * @return a textual representation
270      */
271     public String toString() {
272         return new ToStringBuilder(this)
273             .appendSuper(super.toString())
274             .append("target structure", targetStructure)
275             .toString();
276     }
277 
278 }